/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool 
          with fuzzy matching, translation memory, keyword search, 
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2008 Alex Buloichik
               2019 Thomas Cordonnier
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.core.matching;

import org.omegat.tokenizer.ITokenizer;
import org.omegat.util.Token;

/**
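 * Calculation of the compromised score: a token-level Levenshtein distance
 * in which replacing one token by another costs between 0 and 100 depending
 * on how similar the two tokens are, and the 0-100 similarity score derived
 * from that distance.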
 * 
 * @author Vladimir Levenshtein
 * @author Michael Gilleland, Merriam Park Software
 * @author Chas Emerick, Apache Software Foundation
 * @author Maxym Mykhalchuk
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Thomas Cordonnier
 */
public class ImprovedLevenshteinDistance {

    /** Cost of inserting, deleting or completely replacing one token. */
    private static final int TOKEN_COST = 100;

    /**
     * Maximal number of different items compared. 327 is the largest count
     * for which {@code count * TOKEN_COST} still fits in a {@code short}.
     */
    private static final int MAX_N = 327;

    /** Maximal number of characters compared by {@link #wordDistance}. */
    private static final int MAX_WORD_LENGTH = 100;

    /**
     * Get minimum of three values
     */
    private static short minimum(int a, int b, int c) {
        return (short) Math.min(a, Math.min(b, c));
    }

    /**
     * Counts how many leading tokens of both arrays are equal.
     *
     * @param shorter array whose length is not greater than the other one
     * @param longer  array at least as long as {@code shorter}
     * @return number of equal leading tokens, at most {@code shorter.length}
     */
    private static int commonPrefixLength(String[] shorter, String[] longer) {
        int prefix = 0;
        while (prefix < shorter.length && shorter[prefix].equals(longer[prefix])) {
            prefix++;
        }
        return prefix;
    }

    /**
     * Cost array for token comparison, horizontally. Here to avoid excessive
     * allocation and garbage collection.
     */
    private short[] d = new short[MAX_N + 1];
    /**
     * "Previous" cost array for token comparison, horizontally. Here to avoid
     * excessive allocation and garbage collection.
     */
    private short[] p = new short[MAX_N + 1];

    /**
     * Cost array for character comparison. Kept separate from {@link #d}
     * because a word comparison runs in the middle of a token comparison.
     */
    private short[] wordD = new short[MAX_WORD_LENGTH + 1];
    /** "Previous" cost array for character comparison. */
    private short[] wordP = new short[MAX_WORD_LENGTH + 1];

    private final ITokenizer tokenizer;

    /**
     * Creates a calculator. The instance reuses its internal cost arrays
     * between calls, so a single instance must not be used by several threads
     * at the same time.
     *
     * @param tool tokenizer used to split the compared texts
     */
    public ImprovedLevenshteinDistance(ITokenizer tool) {
        this.tokenizer = tool;
    }

    /**
     * Computes the weighted distance between two texts, tokenized verbatim.
     *
     * @param left  first text
     * @param right second text
     * @return 0 for identical token sequences; each inserted or deleted token
     *         adds 100, each replaced token adds its {@link #wordCost}. At
     *         most {@value #MAX_N} differing tokens are taken into account.
     */
    public short distance(String left, String right) {
        String[] tleft = tokenizer.tokenizeVerbatimToStrings(left);
        String[] tright = tokenizer.tokenizeVerbatimToStrings(right);
        if (tleft.length > tright.length) {
            String[] swap = tright;
            tright = tleft;
            tleft = swap;
        }
        int start = commonPrefixLength(tleft, tright);
        if (start == tleft.length) {
            // The shorter sequence (possibly empty) is a prefix of the longer
            // one: only the extra tokens have to be inserted.
            return (short) (TOKEN_COST * Math.min(MAX_N, tright.length - tleft.length));
        }
        return distance(tleft, tright, start);
    }

    /**
     * Computes the similarity score between two texts. Tokens made only of
     * whitespace are ignored.
     *
     * @param left  first text
     * @param right second text
     * @return score between 0 and 100, see {@link #score(String[], String[])}
     */
    public short score(String left, String right) {
        String[] tleft = noSpace(tokenizer.tokenizeVerbatimToStrings(left));
        String[] tright = noSpace(tokenizer.tokenizeVerbatimToStrings(right));
        return score(tleft, tright);
    }

    /**
     * Computes the similarity score between two token sequences.
     *
     * @param tleft  tokens of the first text
     * @param tright tokens of the second text
     * @return 100 for identical sequences (two empty sequences are treated as
     *         identical), lower values for more different sequences
     */
    public short score(String[] tleft, String[] tright) {
        if (tleft.length > tright.length) {
            String[] swap = tright;
            tright = tleft;
            tleft = swap;
        }
        // From here on tright is the longer (or equally long) sequence.
        if (tright.length == 0) {
            return 100; // nothing on either side
        }
        int start = commonPrefixLength(tleft, tright);
        if (start == tleft.length) {
            // Shorter sequence is a prefix of the longer one: the penalty is
            // the proportion of tokens that are missing.
            return (short) (100 - Math.floor(100.0 * (tright.length - tleft.length) / tright.length));
        }
        return (short) (100 - Math.ceil(1.0 * distance(tleft, tright, start) / tright.length));
    }

    /** Removes items with only space **/
    public String[] noSpace(String[] tab) {
        return java.util.Arrays.stream(tab).filter(item -> !item.trim().isEmpty()).toArray(String[]::new);
    }

    /**
     * Two-row Levenshtein computation on tokens, skipping the first
     * {@code start} tokens of both arrays.
     *
     * @param tleft  tokens of the first text
     * @param tright tokens of the second text
     * @param start  number of leading tokens already known to be equal
     * @return weighted distance, 100 per inserted or deleted token
     */
    private short distance(String[] tleft, String[] tright, int start) {
        // Cap the token count so that the product still fits in a short.
        if (tleft.length == 0) {
            return (short) (Math.min(tright.length, MAX_N) * TOKEN_COST);
        }
        if (tright.length == 0) {
            return (short) (Math.min(tleft.length, MAX_N) * TOKEN_COST);
        }

        int n = Math.min(tleft.length - start, MAX_N);
        int m = Math.min(tright.length - start, MAX_N);

        for (int i = 0; i <= n; i++) {
            p[i] = (short) (i * TOKEN_COST);
        }

        for (int j = 1; j <= m; j++) {
            String rightToken = tright[start + j - 1];
            d[0] = (short) (j * TOKEN_COST);

            for (int i = 1; i <= n; i++) {
                // minimum of cell to the left+100, to the top+100, diagonally
                // left and up + cost of replacing one token by the other
                d[i] = minimum(d[i - 1] + TOKEN_COST, p[i] + TOKEN_COST,
                        p[i - 1] + wordCost(tleft[start + i - 1], rightToken, n));
            }

            // the current row becomes the 'previous row' of the next turn
            short[] swap = p;
            p = d;
            d = swap;
        }

        // our last action in the above loop was to switch d and p, so p now
        // actually has the most recent cost counts
        return p[n];
    }

    /**
     * Cost of replacing one word by another, between 0 and 100.
     * <ul>
     * <li>0: the words are identical</li>
     * <li>10: they differ only by letter case (and possibly '&amp;')</li>
     * <li>20: they differ only by '&amp;' characters, which are skipped
     * during comparison (presumably mnemonic markers - to be confirmed)</li>
     * <li>40: they differ slightly but the tokenizer gives them equal
     * tokens in GLOSSARY stemming mode</li>
     * <li>70 to 100: proportional to the character-level edit distance</li>
     * </ul>
     * To limit the work on long segments, only exact equality is checked
     * when {@code tokensCount} is above 40, and the character-level distance
     * is skipped (cost 100) when it is above 10.
     *
     * @param word1       first word
     * @param word2       second word
     * @param tokensCount number of tokens in the comparison this call is part of
     * @return replacement cost between 0 and 100
     */
    private short wordCost(String word1, String word2, int tokensCount) {
        if (tokensCount > 40) {
            return (short) (word1.equals(word2) ? 0 : 100);
        }
        if (word1.length() > word2.length()) {
            String tmp = word1;
            word1 = word2;
            word2 = tmp;
        }
        int i1 = 0;
        int i2 = 0;
        int idx = 0; // length of the common prefix, start of the detailed comparison
        boolean hasDiff = false;
        boolean hasCase = false;
        boolean hasAmp = false;
        while (i1 < word1.length()) {
            char c1 = word1.charAt(i1);
            if (c1 == '&') {
                i1++;
                hasAmp = true;
                continue;
            }
            if (i2 >= word2.length()) {
                // word2 is exhausted while word1 still has characters
                hasDiff = true;
                break;
            }
            char c2 = word2.charAt(i2);
            if (c2 == '&') {
                i2++;
                hasAmp = true;
                continue;
            }
            if (c1 == c2) {
                i1++;
                i2++;
                if (i1 == i2) {
                    idx = i1;
                }
                continue;
            }
            if (Character.toUpperCase(c1) == Character.toUpperCase(c2)) {
                i1++;
                i2++;
                hasCase = true;
                continue;
            }
            // Still here? diff!
            hasDiff = true;
            break;
        }
        if (!hasDiff) {
            // word1 is consumed; anything left in word2 other than '&' is a
            // real difference ("in" must not be equal to "into").
            while (i2 < word2.length() && word2.charAt(i2) == '&') {
                i2++;
                hasAmp = true;
            }
            if (i2 < word2.length()) {
                hasDiff = true;
            }
        }

        if (!hasDiff) {
            if (hasCase) {
                return 10;
            }
            return (short) (hasAmp ? 20 : 0);
        }
        if (tokensCount > 10) {
            return 100;
        }

        double cost = 30.0 * wordDistance(word1, word2, idx) / Math.max(word1.length(), word2.length());
        if (cost < 10.0) { // check tokenized version
            Token[] tok1 = tokenizer.tokenizeWords(word1, ITokenizer.StemmingMode.GLOSSARY);
            Token[] tok2 = tokenizer.tokenizeWords(word2, ITokenizer.StemmingMode.GLOSSARY);
            if (tok1.length == tok2.length) {
                boolean sameTokens = true;
                for (int i = 0; i < tok1.length; i++) {
                    if (!tok1[i].equals(tok2[i])) {
                        sameTokens = false;
                        break;
                    }
                }
                if (sameTokens) {
                    return 40;
                }
            }
        }
        return (short) Math.floor(cost + 70.0);
    }

    /**
     * Plain character-level Levenshtein distance between two words, skipping
     * the first {@code start} characters of both. At most
     * {@value #MAX_WORD_LENGTH} remaining characters of each word are
     * compared.
     *
     * @param tleft  first word
     * @param tright second word
     * @param start  number of leading characters already known to be equal
     * @return number of character insertions, deletions and replacements
     */
    private short wordDistance(String tleft, String tright, int start) {
        if (tleft.length() == 0) {
            return (short) tright.length();
        }
        if (tright.length() == 0) {
            return (short) tleft.length();
        }

        int n = Math.min(tleft.length() - start, MAX_WORD_LENGTH);
        int m = Math.min(tright.length() - start, MAX_WORD_LENGTH);

        for (int i = 0; i <= n; i++) {
            wordP[i] = (short) i;
        }

        for (int j = 1; j <= m; j++) {
            char rightChar = tright.charAt(start + j - 1);
            wordD[0] = (short) j;

            for (int i = 1; i <= n; i++) {
                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
                int replaceCost = tleft.charAt(start + i - 1) == rightChar ? 0 : 1;
                wordD[i] = minimum(wordD[i - 1] + 1, wordP[i] + 1, wordP[i - 1] + replaceCost);
            }

            // the current row becomes the 'previous row' of the next turn
            short[] swap = wordP;
            wordP = wordD;
            wordD = swap;
        }

        // our last action in the above loop was to switch the rows, so wordP
        // now actually has the most recent cost counts
        return wordP[n];
    }

    /**
     * Command-line comparison of this calculator with LevenshteinDistance.
     * Prints distances, scores and timings to standard error.
     *
     * @param args the two texts to compare
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Usage: ImprovedLevenshteinDistance <left> <right>");
            return;
        }
        String left = args[0], right = args[1];
        ITokenizer tok = new org.omegat.tokenizer.LuceneFrenchTokenizer();
        ImprovedLevenshteinDistance testSelf = new ImprovedLevenshteinDistance(tok);
        long now;
        System.err.println("Left tokens: " + String.join(",", tok.tokenizeVerbatimToStrings(left)));
        System.err.println("Right tokens: " + String.join(",", tok.tokenizeVerbatimToStrings(right)));

        ISimilarityCalculator testOther = new LevenshteinDistance();
        Token[] tokLeft, tokRight;
        tokLeft = tok.tokenizeVerbatim(left);
        tokRight = tok.tokenizeVerbatim(right);
        now = System.currentTimeMillis();
        System.err.println("Adjusted distance = " + testOther.compute(tokLeft, tokRight));
        System.err.println("Took " + (System.currentTimeMillis() - now) + " ms");
        now = System.currentTimeMillis();
        System.err.println("Adjusted score = " + testOther.calcSimilarity(tokLeft, tokRight));
        System.err.println("Took " + (System.currentTimeMillis() - now) + " ms");
        tokLeft = tok.tokenizeWords(left, ITokenizer.StemmingMode.MATCHING);
        tokRight = tok.tokenizeWords(right, ITokenizer.StemmingMode.MATCHING);
        now = System.currentTimeMillis();
        System.err.println("Stemmed distance = " + testOther.compute(tokLeft, tokRight));
        System.err.println("Took " + (System.currentTimeMillis() - now) + " ms");
        now = System.currentTimeMillis();
        System.err.println("Stemmed score = " + testOther.calcSimilarity(tokLeft, tokRight));
        System.err.println("Took " + (System.currentTimeMillis() - now) + " ms");
        tokLeft = tok.tokenizeWords(left, ITokenizer.StemmingMode.NONE);
        tokRight = tok.tokenizeWords(right, ITokenizer.StemmingMode.NONE);
        now = System.currentTimeMillis();
        System.err.println("Unstemmed distance = " + testOther.compute(tokLeft, tokRight));
        System.err.println("Took " + (System.currentTimeMillis() - now) + " ms");
        now = System.currentTimeMillis();
        System.err.println("Unstemmed score = " + testOther.calcSimilarity(tokLeft, tokRight));
        System.err.println("Took " + (System.currentTimeMillis() - now) + " ms");

        System.err.println(" *** ");
        now = System.currentTimeMillis();
        System.err.println("Improved distance = " + testSelf.distance(left, right));
        System.err.println("Took " + (System.currentTimeMillis() - now) + " ms");
        now = System.currentTimeMillis();
        System.err.println("Improved score = " + testSelf.score(left, right));
        System.err.println("Took " + (System.currentTimeMillis() - now) + " ms");
        if ((tokLeft.length == 1) && (tokRight.length == 1)) {
            now = System.currentTimeMillis();
            System.err.println("wordDistance = " + testSelf.wordDistance(left, right, 0));
            System.err.println("Took " + (System.currentTimeMillis() - now) + " ms");
            now = System.currentTimeMillis();
            System.err.println("wordCost = " + testSelf.wordCost(left, right, 0));
            System.err.println("Took " + (System.currentTimeMillis() - now) + " ms");
        }
    }

}
