/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool 
          with fuzzy matching, translation memory, keyword search, 
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2015 Thomas CORDONNIER
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.core.matching.external.lucene;

import java.util.regex.*;
import java.util.*;
import java.io.*;

import org.omegat.core.Core;
import org.omegat.core.matching.external.IBrowsableMemory;
import org.omegat.core.matching.external.IEntryCursor;
import org.omegat.core.search.ISearchable;
import org.omegat.core.search.TextExpression;
import org.omegat.core.data.PrepareTMXEntry;
import org.omegat.util.Language;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;

/**
 * Read-only access to a translation memory stored as a Lucene index.
 *
 * <p>The reader opens the index found in a directory, detects which fields hold the
 * source and the translation, and exposes three ways of reading it: fuzzy lookup
 * ({@link #findMatchingTranslations}), sequential browsing ({@link #browseAllEntries})
 * and text search ({@link #search}).
 *
 * <p>Only the source field is queried through the index; everything else is evaluated
 * by scanning the documents one by one.
 */
public class LuceneReader implements IBrowsableMemory, ISearchable {

    /**
     * Matches a run of exactly four characters, none of which is a backslash, a
     * parenthesis, a brace, a bracket, '+', '-' or '*', and which is not immediately
     * preceded by a backslash. Such runs are treated as the literal parts of a regex.
     * NOTE(review): other regex metacharacters ('.', '?', '|', '^', '$') are not
     * excluded by this pattern - confirm whether that is intended.
     */
    private static final Pattern LITTERAL = Pattern.compile("(?<!\\\\)[^\\\\\\(\\)\\{\\}\\[\\]\\+\\-\\*]{4}");

    private final NgramAnalyzer analyzer = new NgramAnalyzer();

    private final IndexReader indexReader;
    private final IndexSearcher searcher;
    /** Path of the index directory, used as the memory name. */
    private final String name;
    /** Names of the document fields holding the source text and its translation. */
    private final String srcIndex;
    private final String traIndex;

    /**
     * Opens the index whose location is given by the {@code dir} property.
     *
     * @param prop properties containing a {@code dir} entry
     * @throws IOException if the index cannot be opened
     */
    public LuceneReader (java.util.Properties prop) throws IOException {
        this (new File(prop.getProperty("dir")));
    }

    /**
     * Opens the index located in the given directory. When the argument is not a
     * directory, its parent is used instead.
     *
     * <p>The source/target field names are chosen from the indexed fields: {@code src}/
     * {@code tra} when {@code src} is indexed, else {@code x-src.t}/{@code x-tra.t} when
     * {@code x-src.t} is indexed, else the upper-cased project source/target language
     * followed by {@code .t}. Stop n-grams are then loaded from a {@code stop} or
     * {@code stop.lists} file of the directory when one exists.
     *
     * @param indexDirectory index directory, or a file located inside it
     * @throws FileNotFoundException if no directory can be derived from the argument
     * @throws IOException if the index cannot be opened
     */
    public LuceneReader(File indexDirectory) throws IOException {
        System.err.println("Reading Lucene : " + indexDirectory);
        File directory = indexDirectory;
        if (! directory.isDirectory()) {
            directory = directory.getParentFile();
        }
        if (directory == null) {
            // A bare relative file name has no parent: fail clearly instead of with a NullPointerException
            throw new FileNotFoundException("No index directory can be derived from " + indexDirectory);
        }
        name = directory.getPath();
        indexReader = IndexReader.open(FSDirectory.open(directory));
        searcher = new IndexSearcher(indexReader);

        Collection<String> fields = org.apache.lucene.util.ReaderUtil.getIndexedFields(indexReader);
        String sourceField = "src";
        String targetField = "tra";
        if (! fields.contains("src")) { // non OmegaT
            if (fields.contains("x-src.t")) {
                sourceField = "x-src.t";
                targetField = "x-tra.t";
            } else { // Exilis index
                // Locale.ENGLISH keeps the field name independent from the default locale (e.g. Turkish dotted I)
                sourceField = Core.getProject().getProjectProperties().getSourceLanguage().toString().toUpperCase(Locale.ENGLISH) + ".t";
                targetField = Core.getProject().getProjectProperties().getTargetLanguage().toString().toUpperCase(Locale.ENGLISH) + ".t";
            }
        }
        srcIndex = sourceField;
        traIndex = targetField;
        System.out.println("For indexed TM " + name + " : source field = " + srcIndex + ", target index = " + traIndex);

        File stopFile = new File (directory, "stop"); // For OmegaT's format
        if (stopFile.exists()) {
            analyzer.loadstopNgrams(stopFile.getPath(), "src");
        } else {
            stopFile = new File (directory, "stop.lists"); // For Exilis format
            if (stopFile.exists()) { // small indexes may not use stopwords at all
                analyzer.loadstopNgrams(stopFile.getPath(), srcIndex);
            }
        }
    }

    /**
     * @return {@code "Lucene"} when the source field is {@code src}, {@code "Exilis"} otherwise
     */
    public String getProviderName() {
        return srcIndex.equals("src") ? "Lucene" : "Exilis";
    }

    /**
     * @return the path of the index directory
     */
    public String getMemoryName() {
        return name;
    }

    /**
     * Tokenises the text with the analyzer and builds a query containing one term
     * clause on the source field per distinct token.
     *
     * @param text  text to tokenise
     * @param occur occurrence applied to every term clause
     * @return the query, without any clause when the analyzer produced no token
     * @throws IOException if tokenisation fails
     */
    private BooleanQuery buildQuery (String text, BooleanClause.Occur occur) throws IOException {
        final BooleanQuery query = new BooleanQuery();
        final Set<String> terms = new HashSet<String>(); // each distinct token produces a single clause
        final TokenStream tokens = analyzer.tokenStream(srcIndex, new StringReader(text));
        try {
            final TermAttribute termAtt = tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                terms.add (termAtt.term());
            }
            tokens.end();
        } finally {
            // release the stream even when tokenisation fails
            tokens.close();
        }
        for (String term: terms) {
            query.add(new TermQuery(new Term(srcIndex, term)), occur);
        }
        return query;
    }

    /**
     * Finds the entries whose source shares enough tokens with the given text.
     *
     * <p>Every token is optional, but when {@code clauses * minScore / 200} is greater
     * than 1 at least that many tokens must match. {@code sLang} and {@code tLang} are
     * not used by this implementation.
     *
     * @param text     text to look for
     * @param minScore minimal score, as a percentage
     * @param maxCount maximal number of entries returned
     * @return the best matching entries, loaded lazily during iteration
     */
    public Iterable<PrepareTMXEntry> findMatchingTranslations (
        Language sLang, Language tLang, String text,
        int minScore, final int maxCount
    ) throws Exception {
        final BooleanQuery query = buildQuery(text, BooleanClause.Occur.SHOULD);
        final int minShould = query.clauses().size() * minScore / 200; // minScore as %, divided by 2.
        if (minShould > 1) {
            query.setMinimumNumberShouldMatch (minShould);
        }
        return executeQuery (query, maxCount);
    }

    /**
     * Runs the query and returns the best hits. The search itself is done immediately;
     * documents are only loaded from the index while iterating.
     *
     * <p>The iterator throws {@link NoSuchElementException} when exhausted and
     * {@link UncheckedIOException} when a document cannot be read.
     *
     * @param query    query to execute
     * @param maxCount maximal number of hits collected
     * @throws IOException if the search fails
     */
    private Iterable<PrepareTMXEntry> executeQuery (final BooleanQuery query, final int maxCount) throws IOException {
        final TopScoreDocCollector topCollector = TopScoreDocCollector.create(maxCount, true);
        searcher.search(query, topCollector);
        final List<ScoreDoc> hits = Arrays.asList(topCollector.topDocs().scoreDocs);
        return () -> new Iterator<PrepareTMXEntry>() {
            private final Iterator<ScoreDoc> remaining = hits.iterator();

            public boolean hasNext() { return remaining.hasNext(); }

            public PrepareTMXEntry next() {
                // outside the try block, so that NoSuchElementException is not wrapped
                final ScoreDoc hit = remaining.next();
                try {
                    return toEntry (searcher.doc (hit.doc));
                } catch (IOException ioe) {
                    throw new UncheckedIOException (ioe);
                }
            }
        };
    }

    /**
     * Returns a cursor over all non-deleted documents of the index, in document
     * number order. The source and translation are read from the fields detected
     * when the index was opened.
     */
    @Override
    public IEntryCursor browseAllEntries() {
        // Highest document number worth visiting: trailing deleted documents are skipped
        int lastLive = indexReader.maxDoc() - 1;
        while ((lastLive > 0) && (indexReader.isDeleted(lastLive))) {
            lastLive--;
        }
        final int max = lastLive;

        return new IEntryCursor() {
            private int pos = -1;
            private Document currentDoc;

            // Iterator through properties, rebuilt lazily for each document
            private Iterator<Fieldable> propsIterator = null;
            private Fieldable currentField = null;

            /** Moves to the next non-deleted document; returns false when there is none left. */
            public boolean next() throws CorruptIndexException, IOException {
                // Loop rather than recursion: a long run of deleted documents must not grow the stack
                while (pos < max) {
                    pos++;
                    if (! indexReader.isDeleted(pos)) {
                        currentDoc = indexReader.document(pos);
                        propsIterator = null;
                        return true;
                    }
                }
                return false;
            }

            // Next methods retrieve entry content
            // For simplification, we do not make distinction between internal/external TMX and glossary entries

            public String getEntrySource() { return currentDoc.get(srcIndex); }
            public String getEntryTranslation() { return currentDoc.get(traIndex); }
            public String getEntryNote() { return currentDoc.get("note"); }
            public String getEntryAuthor() { return currentDoc.get("author"); }
            public String getEntryLastModifier() { return currentDoc.get("changer"); }

            public long getEntryCreationDate() throws Exception {
                return readDate ("creationDate");
            }
            public long getEntryLastModificationDate() throws Exception {
                return readDate ("changeDate");
            }

            /**
             * Parses a date field of the current document; returns 0 when the field is absent.
             * NOTE(review): 0 is used here as "unknown" - confirm that callers treat it as such.
             */
            private long readDate (String field) throws Exception {
                final String raw = currentDoc.get(field);
                if (raw == null) {
                    return 0L;
                }
                return LuceneWriter.dateFormat.parse(raw).getTime();
            }

            /** Moves to the next field whose name starts with "prop."; returns false when there is none left. */
            public boolean nextProperty() {
                if (propsIterator == null) {
                    final List<Fieldable> props = new ArrayList<>();
                    for (Fieldable field: currentDoc.getFields()) {
                        if (field.name().startsWith("prop.")) {
                            props.add(field);
                        }
                    }
                    propsIterator = props.iterator();
                }
                if (! propsIterator.hasNext()) {
                    return false;
                }
                currentField = propsIterator.next();
                return true;
            }

            public String getCurrentPropertyName() { return currentField.name(); }
            public String getCurrentPropertyValue() { return currentField.stringValue(); }
        };
    }

    /**
     * Searches the memory, through the index when possible.
     *
     * <p>The index is used only when a source expression is given and either
     * {@code andSearch} is set or no target/notes expression is given, and only when at
     * least one literal run can be extracted from the source expression. In every other
     * case the call is delegated to {@link #searchUnindexed}.
     *
     * <p>NOTE(review): on the indexed path only the source n-grams are queried;
     * {@code searchTarget}, {@code searchNotes}, {@code author} and the date bounds are
     * not applied, and hits are not re-checked against the source expression.
     *
     * @param numberOfResults maximal number of entries returned
     * @return the matching entries
     * @throws IOException if the index cannot be read
     */
    @Override
    public Iterable<PrepareTMXEntry> search(int numberOfResults, TextExpression searchSource, TextExpression searchTarget, TextExpression searchNotes, boolean andSearch,
        TextExpression author, // NOTE: does not take DGT-specific "translator" fields, because they would be meaningless for external providers
        long dateAfter, long dateBefore) throws CorruptIndexException, IOException {

        // Only the source is indexed. Since expression uses OR between source, target and notes, we can use index only on source-only searches!
        final boolean orWithOtherFields = (! andSearch) && ((searchTarget != null) || (searchNotes != null));
        if ((searchSource == null) || orWithOtherFields) {
            return searchUnindexed(numberOfResults, searchSource, searchTarget, searchNotes, andSearch, author, dateAfter, dateBefore);
        }

        if (searchSource instanceof TextExpression.RegexTextExpression) {
            final BooleanQuery query = new BooleanQuery();
            addLiteralClauses (query, ((TextExpression.RegexTextExpression) searchSource).getPattern().pattern());
            if (! query.clauses().isEmpty()) {
                return executeQuery (query, numberOfResults);
            }
        }

        if (searchSource instanceof TextExpression.WordsTextExpression) {
            final BooleanQuery query = new BooleanQuery();
            // Every literal of every word is added as a MUST clause.
            // NOTE(review): the original comment described an OR search between words - confirm the intended semantics.
            for (TextExpression.RegexTextExpression word: ((TextExpression.WordsTextExpression) searchSource).split()) {
                addLiteralClauses (query, word.getPattern().pattern());
            }
            if (! query.clauses().isEmpty()) {
                return executeQuery (query, numberOfResults);
            }
        }

        // No lemma search (would need a distinct index)
        return searchUnindexed(numberOfResults, searchSource, searchTarget, searchNotes, andSearch, author, dateAfter, dateBefore);
    }

    /**
     * Adds to the query one mandatory sub-query per literal run found in the regex:
     * in a regex, only the literal parts can be looked up in the index.
     *
     * <p>A literal for which the analyzer produces no token is skipped, because an
     * empty mandatory sub-query would prevent the whole query from matching anything.
     */
    private void addLiteralClauses (BooleanQuery query, String regex) throws IOException {
        final Matcher literals = LITTERAL.matcher (regex);
        while (literals.find()) {
            final BooleanQuery literalQuery = buildQuery(literals.group(), BooleanClause.Occur.MUST);
            if (! literalQuery.clauses().isEmpty()) {
                query.add(literalQuery, BooleanClause.Occur.MUST);
            }
        }
    }

    /**
     * Searches the memory by scanning every entry, without using the index.
     *
     * <p>An entry is kept when it passes the author filter and the text filters. With
     * {@code andSearch} every given text expression must match its field; otherwise at
     * least one of them must. Expressions that are {@code null} are ignored, and an
     * absent field never matches. {@code dateAfter} and {@code dateBefore} are not used.
     *
     * @param numberOfResults scanning stops as soon as this many entries were collected
     * @return the matching entries, in index order
     * @throws IOException if the index cannot be read, or if an entry cannot be converted
     */
    public Iterable<PrepareTMXEntry> searchUnindexed(int numberOfResults, TextExpression searchSource, TextExpression searchTarget, TextExpression searchNotes, boolean andSearch,
        TextExpression author, // NOTE: does not take DGT-specific "translator" fields, because they would be meaningless for external providers
        long dateAfter, long dateBefore) throws CorruptIndexException, IOException {

        final List<PrepareTMXEntry> result = new ArrayList<>();
        try {
            final IEntryCursor entry = browseAllEntries();
            while (entry.next()) {
                if (! passesAuthorFilter (entry, author)) {
                    continue;
                }
                if (! passesTextFilters (entry, searchSource, searchTarget, searchNotes, andSearch)) {
                    continue;
                }
                // still here? add the result
                result.add(entry.toPrepareTMXEntry());
                if (result.size() >= numberOfResults) {
                    break;
                }
            }
        } catch (Exception e) {
            // Failures are reported instead of silently returning a truncated result
            if (e instanceof IOException) {
                throw (IOException) e;
            }
            if (e instanceof RuntimeException) {
                throw (RuntimeException) e;
            }
            throw new IOException("Cannot browse indexed memory " + name, e);
        }
        return result;
    }

    /**
     * Checks the author expression against the last modifier of the entry, or against
     * its author when there is no last modifier. Entries having neither are accepted.
     */
    private static boolean passesAuthorFilter (IEntryCursor entry, TextExpression author) throws Exception {
        if (author == null) {
            return true;
        }
        String person = entry.getEntryLastModifier();
        if (person == null) {
            person = entry.getEntryAuthor();
        }
        return (person == null) || author.matchesString(person);
    }

    /**
     * Checks the source, target and notes expressions against the entry. Only the
     * expressions which are not null take part: all of them must match in an AND
     * search, at least one in an OR search. Without any expression the entry is accepted.
     */
    private static boolean passesTextFilters (IEntryCursor entry, TextExpression searchSource, TextExpression searchTarget,
        TextExpression searchNotes, boolean andSearch) throws Exception {
        int active = 0;
        int matched = 0;
        if (searchSource != null) {
            active++;
            if (fieldMatches (searchSource, entry.getEntrySource())) matched++;
        }
        if (searchTarget != null) {
            active++;
            if (fieldMatches (searchTarget, entry.getEntryTranslation())) matched++;
        }
        if (searchNotes != null) {
            active++;
            if (fieldMatches (searchNotes, entry.getEntryNote())) matched++;
        }
        if (active == 0) {
            return true;
        }
        return andSearch ? (matched == active) : (matched > 0);
    }

    /** An absent field (null) never matches. */
    private static boolean fieldMatches (TextExpression expression, String text) {
        return (text != null) && expression.matchesString(text);
    }

    /**
     * Converts an index document to an entry. Source, translation, creator and changer
     * are filled; the creation and change dates are not.
     */
    private PrepareTMXEntry toEntry (Document doc) {
        final PrepareTMXEntry entry = new PrepareTMXEntry();
        entry.source = doc.get(srcIndex);
        entry.translation = doc.get(traIndex);
        entry.creator = doc.get("author");
        entry.changer = doc.get("changer");
        return entry;
    }
}
