/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool 
          with fuzzy matching, translation memory, keyword search, 
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2008 Alex Buloichik
               2012 Thomas Cordonnier, Martin Fleurke
               2013 Aaron Madlon-Kay, Alex Buloichik, Thomas Cordonnier
               2015-2021 Thomas Cordonnier
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.core.matching;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.omegat.core.Core;
import org.omegat.core.data.EntryKey;
import org.omegat.core.data.ExternalTMX;
import org.omegat.core.data.IProject;
import org.omegat.core.data.IProject.DefaultTranslationsIterator;
import org.omegat.core.data.IProject.MultipleTranslationsIterator;
import org.omegat.core.data.ITMXEntry;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.data.StringData;
import org.omegat.core.data.TMXEntry;
import org.omegat.core.events.IStopped;
import org.omegat.core.matching.external.IExternalMemory;
import org.omegat.core.segmentation.Rule;
import org.omegat.tokenizer.ITokenizer;
import org.omegat.tokenizer.DefaultTokenizer;
import org.omegat.util.Language;
import org.omegat.util.Log;
import org.omegat.util.OConsts;
import org.omegat.util.OStrings;
import org.omegat.util.PatternConsts;
import org.omegat.util.Preferences;
import org.omegat.util.TMXProp;
import org.omegat.util.Token;

/**
 * Base class to find matches by specified criteria.
 * 
 * Since we can use stemmers to prepare tokens, we should use 3-pass comparison of similarity. Similarity will
 * be calculated in 3 steps:
 * 
 * 1. Split original segment into word-only tokens using stemmer (with stop words list), then compare tokens.
 * 
 * 2. Split original segment into word-only tokens without stemmer, then compare tokens.
 * 
 * 3. Split original segment into not-only-words tokens (including numbers and tags) without stemmer, then
 * compare tokens.
 * 
 * Subclasses decide what to do with every candidate by implementing
 * {@link #evalEntry(EntryKey, String, String, String, int, boolean, NearString.MATCH_SOURCE, String, String, long, String, long, String, String, List)}
 * and tell how many results they want through {@link #maxCount()}.
 * 
 * This class is not thread safe! An instance keeps the state of the current search (searched text, tokens,
 * token caches) in its fields, so it must be used from one thread only.
 * 
 * @author Maxym Mykhalchuk
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Martin Fleurke
 * @author Aaron Madlon-Kay
 * @author Thomas Cordonnier
 */
public abstract class FindMatches {
    /** Sort key for results, read from the preferences (defaults to sorting by score). */
    protected NearString.SORT_KEY sortKey = Preferences.getPreferenceEnumDefault(Preferences.EXT_TMX_SORT_KEY, NearString.SORT_KEY.SCORE);

    /** Penalty added to entries flagged as fuzzy. */
    private static final int PENALTY_FOR_FUZZY = 20;
    /** Penalty added when the text removed by the remove pattern differs between searched text and candidate. */
    private static final int PENALTY_FOR_REMOVED = 5;
    /** Minimal score a single sentence must reach to take part in a sub-segment match. */
    private static final int SUBSEGMENT_MATCH_THRESHOLD = 85;

    private static final boolean ALLOW_PARTIALY_MATCH = true;

    /** Extracts the penalty from names containing "penalty-NN". Compiled once instead of once per search. */
    private static final Pattern SEARCH_FOR_PENALTY = Pattern.compile("penalty-(\\d+)");

    private final ISimilarityCalculator distance = new LevenshteinDistance();

    /**
     * the removePattern that was configured by the user.
     */
    private final Pattern removePattern = PatternConsts.getRemovePattern();

    /** Fallback tokenizer, used when the configured one produces no word token for the searched text. */
    private static final ITokenizer ALT_TOKENIZER = new DefaultTokenizer();
    /** Tokenizer currently in use; temporarily replaced by {@link #ALT_TOKENIZER} during a search if needed. */
    private ITokenizer tok;
    private final Locale srcLocale;

    private final boolean searchExactlyTheSame;
    /** Searched text, exactly as given to {@link #doSearch}. */
    private String originalText;
    /** Searched text after the parts matching the remove pattern have been stripped. */
    private String srcText;

    /**
     * Text that was removed by the removePattern from the source text.
     */
    private String removedText;

    /** Tokens for original string, with and without stems. */
    private Token[] strTokensStem, strTokensNoStem;

    /** Tokens for original string, includes numbers and tags. */
    private Token[] strTokensAll;

    // This finder used for search separate segment matches
    FindMatchesSingleton separateSegmentMatcher;

    // External memories
    private final IExternalMemory[] externalMemories;
    
    /** When not null, the search is restricted according to this path (see {@link #doSearch}). */
    private final String memoryPath;

    /**
     * @param sourceTokenizer
     *            tokenizer used for the searched text and for the candidates
     * @param externalMemories
     *            additional memory providers to query on each search
     * @param memoryPath
     *            when not null, project entries and sub-segment matches are not searched, and only
     *            translation memories / providers whose name contains this text are queried
     * @param allowSeparateSegmentMatch
     *            if true, a second finder is created to match the sentences of a paragraph separately
     * @param searchExactlyTheSame
     *            allows to search similarities with the same text as source segment. This mode used only for
     *            separate sentence match in paragraph project, i.e. where source is just part of current
     *            source.
     */
    public FindMatches(ITokenizer sourceTokenizer, final IExternalMemory[] externalMemories, String memoryPath, boolean allowSeparateSegmentMatch, boolean searchExactlyTheSame) {
        tok = sourceTokenizer;
        srcLocale = Core.getProject().getProjectProperties().getSourceLanguage().getLocale();
        this.searchExactlyTheSame = searchExactlyTheSame;
        this.externalMemories = externalMemories;
        this.memoryPath = memoryPath;
        if (allowSeparateSegmentMatch) {
            separateSegmentMatcher = new FindMatchesSingleton(sourceTokenizer, externalMemories, false, true);
        }
    }
    
    /**
     * Same as the five-argument constructor, without restriction on the memory path.
     * 
     * @param searchExactlyTheSame
     *            allows to search similarities with the same text as source segment. This mode used only for
     *            separate sentence match in paragraph project, i.e. where source is just part of current
     *            source.
     */
    public FindMatches(ITokenizer sourceTokenizer, final IExternalMemory[] externalMemories, boolean allowSeparateSegmentMatch, boolean searchExactlyTheSame) {
        this(sourceTokenizer, externalMemories, null, allowSeparateSegmentMatch, searchExactlyTheSame);
    }
    
    /**
     * Builds the similarity data for color highlight in match window.
     * 
     * Every token of the match source gets one attribute: {@link StringData#UNIQ} when the token is not
     * found in the searched text, {@link StringData#PAIR} when it is found but one of its neighbours is not,
     * and 0 otherwise. The result is stored in <code>near.attr</code>.
     * 
     * @param near
     *            match to annotate; relies on the tokens of the last {@link #doSearch} call
     */
    protected void buildSimilarityData(NearString near) {
        Token[] matchTokens = tokenizeAll(near.source);
        int len = matchTokens.length;

        // Test each token only once; the neighbours' results are reused below.
        boolean[] found = new boolean[len];
        for (int i = 0; i < len; i++) {
            found[i] = DefaultTokenizer.isContains(strTokensAll, matchTokens[i]);
        }

        byte[] result = new byte[len];
        for (int i = 0; i < len; i++) {
            // The (missing) neighbours of the first and last tokens count as found.
            boolean leftFound = (i == 0) || found[i - 1];
            boolean rightFound = (i + 1 == len) || found[i + 1];
            if (!found[i]) {
                result[i] = StringData.UNIQ;
            } else if (!leftFound || !rightFound) {
                result[i] = StringData.PAIR;
            }
        }
        
        near.attr = result;
    }
    
    /** Minimal score asked to the project translation memories, read from the preferences. */
    protected int FUZZY_MATCH_THRESHOLD = Preferences.getPreferenceDefault(Preferences.EXT_TMX_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD);

    /**
     * Searches candidates for the given text and hands each of them to {@link #processEntry}.
     * 
     * Candidates come, in this order, from: the project translations (only when no memory path is set), the
     * project translation memories, the external memory providers, the translations found in source files
     * and, for projects without sentence segmentation, a match glued from the matches of each sentence.
     * 
     * @param project
     *            project to search in
     * @param searchText
     *            text to find matches for
     * @param requiresTranslation
     *            if true, candidates without translation are skipped
     * @param fillSimilarityData
     *            not used by this method itself
     * @param stop
     *            polled regularly; the search is aborted as soon as it reports to be stopped
     * @throws StoppedException
     *             if the search has been stopped
     */
    protected void doSearch(final IProject project, final String searchText,
            final boolean requiresTranslation, final boolean fillSimilarityData, final IStopped stop)
            throws StoppedException {
        originalText = searchText;

        // remove part that is to be removed according to user settings.
        // Rationale: it might be a big string influencing the 'editing distance', while it is not really part
        // of the translatable text
        StringBuilder removed = new StringBuilder();
        srcText = stripRemovedParts(searchText, removed);
        this.removedText = removed.toString();

        final ITokenizer configuredTokenizer = tok;
        try {
            // get tokens for original string
            tokenizeSearchedText();
            if ((strTokensStem.length == 0) || (strTokensNoStem.length == 0)) {
                // The tokenizer has a bug, use default. Else, no match can be found because these tokenizers have priority against "all"
                this.tok = ALT_TOKENIZER; // note: will also affect candidate tokens
                // avoid getting the wrong results from the cache
                tokenizeStemCache.clear();
                tokenizeNoStemCache.clear();
                tokenizeAllCache.clear();
                // now re-tokenize with the new (default) tokenizer
                tokenizeSearchedText();
            }

            if (memoryPath == null) {
                searchInProject(project, requiresTranslation, stop);
            }

            Language sourceLang = project.getProjectProperties().getSourceLanguage();
            Language targetLang = project.getProjectProperties().getTargetLanguage();

            searchInTranslationMemories(project, sourceLang, targetLang, requiresTranslation, stop);
            searchInExternalMemories(sourceLang, targetLang, requiresTranslation, stop);
            searchInSourceTranslations(project, stop);
            searchBySubSegments(project, sourceLang, targetLang, requiresTranslation, stop);
        } finally {
            // Restore the configured tokenizer even when the search is stopped or fails: otherwise the
            // fallback tokenizer would silently stay in use for all later searches.
            // NOTE(review): the caches may still contain tokens produced by the fallback tokenizer.
            tok = configuredTokenizer;
        }
    }

    /** Tokenizes the searched text (after removal) in the three flavours used for the three scores. */
    private void tokenizeSearchedText() {
        strTokensStem = tokenizeStem(srcText);
        strTokensNoStem = tokenizeNoStem(srcText);
        strTokensAll = tokenizeAll(srcText);
    }

    /**
     * Strips from the text everything matching the user-defined remove pattern.
     * 
     * @param text
     *            text to clean
     * @param removedOut
     *            receives the removed parts, in order of appearance
     * @return the text without the removed parts; the text itself when no remove pattern is configured
     */
    private String stripRemovedParts(String text, StringBuilder removedOut) {
        if (removePattern == null) {
            return text;
        }
        Matcher removeMatcher = removePattern.matcher(text);
        while (removeMatcher.find()) {
            removedOut.append(removeMatcher.group());
        }
        return removeMatcher.replaceAll(""); // replaceAll resets the matcher before working
    }

    /** Parses a decimal integer; returns 0 when the text is null or not a valid int. */
    private static int parseIntOrZero(String text) {
        try {
            return Integer.parseInt(text);
        } catch (NumberFormatException ignored) {
            // best effort: a missing or unreadable penalty means "no penalty"
            return 0;
        }
    }

    /** Returns the NN of the first "penalty-NN" found in the name, or 0 if there is none (or name is null). */
    private static int penaltyFromName(CharSequence name) {
        if (name == null) {
            return 0;
        }
        Matcher matcher = SEARCH_FOR_PENALTY.matcher(name);
        return matcher.find() ? parseIntOrZero(matcher.group(1)) : 0;
    }

    /** Returns the first group of {@link PatternConsts#LANG_AND_COUNTRY} found in the code, else the code itself. */
    private static String baseLanguageCode(String code) {
        Matcher matcher = PatternConsts.LANG_AND_COUNTRY.matcher(code);
        return matcher.find() ? matcher.group(1) : code;
    }

    /**
     * Computes the penalty for an entry whose target language differs from the project's one.
     * 
     * The preference is a list separated by ';'. All items but the last one are "language penalty" pairs;
     * the last item starts with the default penalty, followed by a space.
     * 
     * @param entryLang
     *            language of the entry, as returned by {@link #baseLanguageCode(String)}
     * @param prefForeign
     *            value of the preference
     * @return penalty configured for this language (even if it is 0), else the default one, else 0
     */
    private static int foreignLanguagePenalty(String entryLang, String prefForeign) {
        String[] prefs = prefForeign.split(";");
        for (int i = 0; i < prefs.length - 1; i++) {
            String[] parts = prefs[i].split(" ");
            if (parts.length < 2) {
                continue; // no penalty given for this item
            }
            if (baseLanguageCode(parts[0]).equalsIgnoreCase(entryLang)) {
                try {
                    // Return directly, so that an explicit penalty of 0 is not replaced by the default.
                    return Integer.parseInt(parts[1]);
                } catch (NumberFormatException ignored) {
                    // unreadable penalty: try the next items, then the default
                }
            }
        }
        String defaultPref = prefForeign.substring(prefForeign.lastIndexOf(';') + 1);
        int space = defaultPref.indexOf(' ');
        return (space > 0) ? parseIntOrZero(defaultPref.substring(0, space)) : 0;
    }

    /**
     * Travels by the translation memories of the project.
     * 
     * The penalty of an entry is the sum of the "penalty-NN" found in the memory path, the "omegat.penalty"
     * property of the entry and the penalty for a foreign target language ("x-target-lang" property).
     */
    private void searchInTranslationMemories(IProject project, Language sourceLang, Language targetLang,
            boolean requiresTranslation, IStopped stop) {
        final String prefForeign = Preferences.getPreferenceDefault(Preferences.EXT_TMX_KEEP_FOREIGN_MATCH, "30 false");

        for (Map.Entry<String, IExternalMemory> en : project.getTransMemories().entrySet()) {
            if ((memoryPath != null) && !en.getKey().contains(memoryPath)) {
                continue;
            }
            int pathPenalty = penaltyFromName(en.getKey());
            try {
                for (ITMXEntry tmen : en.getValue().findMatchingTranslations(sourceLang, targetLang, 
                        originalText, this.FUZZY_MATCH_THRESHOLD, maxCount())) {
                    checkStopped(stop);
                    if (tmen.getSourceText() == null) {
                        continue; // Not all TMX entries have a source; in that case there can be no meaningful match, so skip.
                    }
                    if (requiresTranslation && tmen.getTranslationText() == null) {
                        continue;
                    }
                    int entryPenalty = parseIntOrZero(tmen.getPropValue("omegat.penalty"));
                    int foreignPenalty = 0;
                    String entryLang = tmen.getPropValue("x-target-lang");
                    if (entryLang != null) {
                        entryLang = baseLanguageCode(entryLang);
                        if (!entryLang.equalsIgnoreCase(targetLang.getLanguageCode())) {
                            if (prefForeign.endsWith("false")) {
                                continue; // foreign matches are not wanted at all
                            }
                            foreignPenalty = foreignLanguagePenalty(entryLang, prefForeign);
                        }
                    }
                    processEntry(null, tmen.getSourceText(), tmen.getTranslationText(), NearString.MATCH_SOURCE.TM, false,
                            pathPenalty + entryPenalty + foreignPenalty,
                            en.getValue().getMemoryName(), tmen.getCreator(), tmen.getCreationDate(), tmen.getChanger(), tmen.getChangeDate(), tmen.getPropValue("revisor"), 
                            tmen.getNote(), tmen.getProperties());
                }
            } catch (StoppedException stopex) {
                throw stopex;
            } catch (Exception e) {
                Log.log (e); // log, but continue.
            }
        }
    }

    /**
     * Travels by the external memory providers given to the constructor.
     * 
     * NOTE(review): unlike the project translation memories, providers are queried with the constant
     * OConsts.FUZZY_MATCH_THRESHOLD, not with the configurable threshold - confirm this is intended.
     */
    private void searchInExternalMemories(Language sourceLang, Language targetLang,
            boolean requiresTranslation, IStopped stop) {
        if (externalMemories == null) {
            return;
        }
        for (IExternalMemory provider: externalMemories) {
            try {
                if ((memoryPath != null) && !provider.getMemoryName().contains(memoryPath)) {
                    continue;
                }
                for (ITMXEntry entry: provider.findMatchingTranslations(sourceLang, targetLang, 
                        originalText, OConsts.FUZZY_MATCH_THRESHOLD, maxCount())) {
                    checkStopped(stop);
                    // Same filters as for project translation memories: an entry without source would make
                    // processEntry fail and the remaining entries of this provider would be lost.
                    if (entry.getSourceText() == null) {
                        continue;
                    }
                    if (requiresTranslation && entry.getTranslationText() == null) {
                        continue;
                    }
                    processEntry(null, entry.getSourceText(), entry.getTranslationText(), 
                        NearString.MATCH_SOURCE.TM, false, 0, provider.getMemoryName(), 
                        entry.getCreator(), entry.getCreationDate(), entry.getChanger(), entry.getChangeDate(), 
                        entry.getPropValue("revisor"), entry.getNote(), entry.getProperties());
                }
            } catch (StoppedException stopex) {
                throw stopex;
            } catch (Exception e) {
                Log.log (e); // log, but continue.
            }
        }
    }

    /**
     * Travels by all entries for check source file translations.
     * 
     * NOTE(review): with a memory path, entries whose file name CONTAINS the path are skipped, which is the
     * opposite of the filter applied to memories - confirm this is intended.
     */
    private void searchInSourceTranslations(IProject project, IStopped stop) {
        for (SourceTextEntry ste : project.getAllEntries()) {
            checkStopped(stop);
            EntryKey key = ste.getKey();
            String fileName = (key != null) ? key.file : null;
            if ((memoryPath != null) && (fileName != null) && fileName.contains(memoryPath)) {
                continue;
            }
            if (ste.getSourceTranslation() != null) {
                processEntry(key, ste.getSrcText(), ste.getSourceTranslation(),
                        NearString.MATCH_SOURCE.MEMORY, ste.isSourceTranslationFuzzy(), 0, fileName,
                        "", 0, "", 0, "", ste.getComment(), null);
            }
        }
    }

    /**
     * Splits the searched paragraph into sentences, finds the best match of each sentence, then glues the
     * results into one candidate named "Sub-segment match".
     * 
     * Only done when a separate segment matcher exists, no memory path is set and sentence segmenting is
     * disabled in the project. Sentences without good enough match are replaced by an empty text. The
     * penalty of the result is the highest penalty among the sentences.
     */
    private void searchBySubSegments(IProject project, Language sourceLang, Language targetLang,
            boolean requiresTranslation, IStopped stop) {
        if (!ALLOW_PARTIALY_MATCH || (separateSegmentMatcher == null) || (memoryPath != null)
                || project.getProjectProperties().isSentenceSegmentingEnabled()) {
            return;
        }
        // split paragraph even when segmentation disabled, then find matches for every segment
        List<StringBuilder> spaces = new ArrayList<StringBuilder>();
        List<Rule> brules = new ArrayList<Rule>();
        List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
        if (segments.size() <= 1) {
            return; // one single segment: nothing to combine
        }

        List<String> fsrc = new ArrayList<String>(segments.size());
        List<String> ftrans = new ArrayList<String>(segments.size());
        int maxPenalty = 0;
        for (String onesrc : segments) {
            // find match for separate segment
            NearString segmentMatch = separateSegmentMatcher.search(project, onesrc,
                    requiresTranslation, false, stop);
            if (segmentMatch != null && segmentMatch.score >= SUBSEGMENT_MATCH_THRESHOLD) {
                fsrc.add(segmentMatch.source);
                ftrans.add(segmentMatch.translation);
                if (segmentMatch.fuzzyMark) {
                    maxPenalty = Math.max(maxPenalty, PENALTY_FOR_FUZZY);
                }
                // proj may be null here (presumably for matches from project files), hence the null-safe helper
                maxPenalty = Math.max(maxPenalty, penaltyFromName(segmentMatch.proj));
            } else {
                fsrc.add("");
                ftrans.add("");
            }
        }
        // glue found sources
        String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
        // glue found translations
        String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
        processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, maxPenalty, "Sub-segment match", 
                "", 0, "", 0, "", "", null);
    }
    
    /**
     * Travels by project entries, including orphaned.
     * 
     * Default translations are visited only when the project supports them; translations bound to an entry
     * key are always visited. Unless searchExactlyTheSame is set, entries whose source equals the searched
     * text are skipped.
     */
    private void searchInProject(final IProject project, final boolean requiresTranslation, final IStopped stop) {
        final String orphanedFileName = OStrings.getString("CT_ORPHAN_STRINGS");

        if (project.getProjectProperties().isSupportDefaultTranslations()) {
            project.iterateByDefaultTranslations((String source, TMXEntry trans) -> {
                checkStopped(stop);
                if (!searchExactlyTheSame && source.equals(originalText)) {
                    return; // skip original==original entry comparison
                }
                if (requiresTranslation && trans.translation == null) {
                    return;
                }
                String fileName = project.isOrphaned(source) ? orphanedFileName : null;
                processEntry(null, source, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0,
                        fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate,
                        trans.revisor, trans.note, null);
            });
        }
        project.iterateByMultipleTranslations((EntryKey source, TMXEntry trans) -> {
            checkStopped(stop);
            if (!searchExactlyTheSame && source.sourceText.equals(originalText)) {
                return; // skip original==original entry comparison
            }
            if (requiresTranslation && trans.translation == null) {
                return;
            }
            String fileName = project.isOrphaned(source) ? orphanedFileName : null;
            processEntry(source, source.sourceText, trans.translation, NearString.MATCH_SOURCE.MEMORY,
                    false, 0, fileName, trans.creator, trans.creationDate, trans.changer,
                    trans.changeDate, trans.revisor, trans.note, null);
        });
    }
    
    /**
     * Prepares one candidate, then gives it to {@link #evalEntry}.
     * 
     * The parts matching the remove pattern are stripped from the source, and the final penalty is
     * computed: given penalty, plus {@link #PENALTY_FOR_FUZZY} for fuzzy entries, plus
     * {@link #PENALTY_FOR_REMOVED} when the removed text differs from the one of the searched text.
     * 
     * @param key
     *            key of the entry, or null
     * @param source
     *            source text of the candidate
     * @param translation
     *            translation of the candidate
     * @param comesFrom
     *            kind of origin of the candidate
     * @param fuzzy
     *            true if the candidate is marked as fuzzy
     * @param penalty
     *            penalty already known for this candidate
     * @param tmxName
     *            name of the memory or file the candidate comes from, may be null
     * @param props
     *            properties of the entry, may be null
     */
    protected void processEntry(final EntryKey key, final String source, final String translation,
            NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int penalty, final String tmxName,
            final String creator, final long creationDate, final String changer, final long changedDate,
            final String revisor, final String note, final List<TMXProp> props) {
        int totalPenalty = fuzzy ? penalty + PENALTY_FOR_FUZZY : penalty;

        // remove part that is to be removed prior to tokenize
        String realSource = source;
        if (this.removePattern != null) {
            StringBuilder entryRemoved = new StringBuilder();
            realSource = stripRemovedParts(source, entryRemoved);
            // calculate penalty if something has been removed, otherwise different strings get 100% match.
            if (!entryRemoved.toString().equals(this.removedText)) {
                // penalty for different 'removed'-part
                totalPenalty += PENALTY_FOR_REMOVED;
            }
        }
        
        evalEntry(key, source, translation, realSource, totalPenalty, fuzzy, comesFrom, 
                tmxName, creator, creationDate, changer, changedDate, revisor, note, props);
    }
    
    /**
     * Evaluates one candidate prepared by {@link #processEntry}.
     * 
     * @param source
     *            source text of the candidate, as found
     * @param realSource
     *            source text without the parts matching the remove pattern; this is the text to compare
     * @param totalPenalty
     *            sum of all penalties which apply to this candidate
     */
    protected abstract void evalEntry(final EntryKey key, final String source, final String translation,
            String realSource, int totalPenalty, boolean fuzzy,
            NearString.MATCH_SOURCE comesFrom, final String tmxName,
            final String creator, final long creationDate, final String changer, final long changedDate,
            final String revisor, final String note, final List<TMXProp> props);
            
    /**
     * @return value passed as last argument when asking matching translations to the memories;
     *         presumably the maximal number of results wanted
     */
    public abstract int maxCount();
    
    /** Similarity between searched text and candidate, based on stemmed word tokens. */
    protected final int calcSimilarityStem(String realSource) {
        return distance.calcSimilarity(strTokensStem, tokenizeStem(realSource));
    }

    /** Similarity between searched text and candidate, based on lower-cased word tokens without stemming. */
    protected final int calcSimilarityNoStem(String realSource) {
        return distance.calcSimilarity(strTokensNoStem, tokenizeNoStem(realSource));
    }

    /** Similarity between searched text and candidate, based on verbatim tokens. */
    protected final int calcSimilarityAdjusted(String realSource) {
        return distance.calcSimilarity(strTokensAll, tokenizeAll(realSource));
    }

    /*
     * Methods for tokenize strings with caching.
     */
    Map<String, Token[]> tokenizeStemCache = new HashMap<String, Token[]>();
    Map<String, Token[]> tokenizeNoStemCache = new HashMap<String, Token[]>();
    Map<String, Token[]> tokenizeAllCache = new HashMap<String, Token[]>();

    /**
     * @return word tokens of the text, using the stemming mode for matching; results are cached
     */
    public Token[] tokenizeStem(String str) {
        Token[] result = tokenizeStemCache.get(str);
        if (result == null) {
            result = tok.tokenizeWords(str, ITokenizer.StemmingMode.MATCHING);
            tokenizeStemCache.put(str, result);
        }
        return result;
    }

    /**
     * @return word tokens of the lower-cased text, without stemming; results are cached
     */
    public Token[] tokenizeNoStem(String str) {
        // No-stemming token comparisons are intentionally case-insensitive
        // for matching purposes.
        str = str.toLowerCase(srcLocale);
        Token[] result = tokenizeNoStemCache.get(str);
        if (result == null) {
            result = tok.tokenizeWords(str, ITokenizer.StemmingMode.NONE);
            tokenizeNoStemCache.put(str, result);
        }
        return result;
    }

    /**
     * @return verbatim tokens of the text, case preserved; results are cached
     */
    public Token[] tokenizeAll(String str) {
        // Verbatim token comparisons are intentionally case-sensitive,
        // so that strings differing only by case have at least one of the 3 scores penalized;
        // OmegaT 3 explicitly made it case insensitive because "This is how OmegaT behaves since at least 1.8" (https://sourceforge.net/p/omegat/bugs/755/)
        // but it is wrong because in OmegaT 2.6, adjusted score was case sensitive!
        Token[] result = tokenizeAllCache.get(str);
        if (result == null) {
            result = tok.tokenizeVerbatim(str);
            tokenizeAllCache.put(str, result);
        }
        return result;
    }

    /**
     * Aborts the current search if it has been asked to stop.
     * 
     * @throws StoppedException
     *             if <code>stop</code> reports to be stopped
     */
    protected void checkStopped(IStopped stop) throws StoppedException {
        if (stop.isStopped()) {
            throw new StoppedException();
        }
    }

    /**
     * Process will throw this exception if it stopped. All callers must catch it and just skip.
     */
    @SuppressWarnings("serial")
    public static class StoppedException extends RuntimeException {
    }
}
