/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool 
          with fuzzy matching, translation memory, keyword search, 
          glossaries, and translation leveraging into updated projects.
 
 Copyright (C) 2015 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.tokenizer;

import org.apache.commons.lang.StringUtils;
import org.omegat.tokenizer.ITokenizer.StemmingMode;
import org.omegat.util.Token;

import junit.framework.TestCase;

/**
 * Tests for several {@link ITokenizer} implementations.
 * <p>
 * Each test tokenizes one fixed sentence verbatim and then as words in every
 * {@link StemmingMode}, comparing the output against hard-coded expected
 * token arrays.
 */
public class TokenizerTest extends TestCase {

    /**
     * Checks {@link LuceneEnglishTokenizer} on a sentence containing
     * punctuation, a tag, a number and quoted text. The expected arrays pin
     * the exact tokens produced verbatim and in each {@link StemmingMode}.
     */
    public void testEnglish() {
        ITokenizer tok = new LuceneEnglishTokenizer();
        String orig = "The quick, brown <x0/> jumped over 1 \"lazy\" dog.";
        assertVerbatim(new String[] { "The", " ", "quick", ",", " ", "brown", " ", "<x0/>", " ",
                "jumped", " ", "over", " ", "1", " ", "\"", "lazy", "\"", " ", "dog", "." },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "dog" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "the", "quick", "brown", "x0", "jump", "jumped", "over", "1", "lazi", "lazy", "dog" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "quick", "brown", "jump", "jumped", "lazi", "lazy", "dog" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }

    /**
     * Turkish warrants special testing because it has the letter \u0130
     * (LATIN CAPITAL LETTER I WITH DOT ABOVE); the result (both content
     * and length) of performing <code>"\u0130".toLowerCase()</code> depends
     * on the default Locale, and in the past there were issues with improper
     * lowercasing during tokenization leading to OOB exceptions.
     * <p>
     * Text from https://tr.wikipedia.org/wiki/T%C3%BCrk%C3%A7e
     * <p>
     * Note that the string literal below has no whitespace at the two points
     * where it is split across source lines, so the words on either side of
     * each split are fused into a single word. The expected token arrays
     * match the text exactly as written here.
     */
    public void testTurkish() {
        ITokenizer tok = new LuceneTurkishTokenizer();
        String orig = "\u201C\u0130stanbul a\u011Fz\u0131\u201D, T\u00FCrkiye T\u00FCrk\u00E7esi"
                + "yaz\u0131 dilinin kayna\u011F\u0131 olarak kabul edilir; yaz\u0131 dili bu"
                + "a\u011F\u0131z temelinde olu\u015Fmu\u015Ftur.";
        assertVerbatim(new String[] { "\u201C", "\u0130stanbul", " ", "a\u011Fz\u0131", "\u201D",
                ",", " ", "T\u00FCrkiye", " ", "T\u00FCrk\u00E7esiyaz\u0131", " ", "dilinin", " ",
                "kayna\u011F\u0131", " ", "olarak", " ", "kabul", " ", "edilir", ";", " ", "yaz\u0131",
                " ", "dili", " ", "bua\u011F\u0131z", " ", "temelinde", " ", "olu\u015Fmu\u015Ftur", "." },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "\u0130stanbul", "a\u011Fz\u0131", "T\u00FCrkiye",
                "T\u00FCrk\u00E7esiyaz\u0131", "dilinin", "kayna\u011F\u0131", "olarak",
                "kabul", "edilir", "yaz\u0131", "dili", "bua\u011F\u0131z", "temelinde",
                "olu\u015Fmu\u015Ftur" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "istanbul", "a\u011Fz\u0131", "t\u00FCrki", "T\u00FCrkiye",
                "t\u00FCrk\u00E7esiyaz", "T\u00FCrk\u00E7esiyaz\u0131", "dil", "dilinin",
                "kaynak", "kayna\u011F\u0131", "olarak", "kabul", "edilir", "yaz", "yaz\u0131",
                "dil", "dili", "buak", "bua\u011F\u0131z", "temel", "temelinde", "olu\u015F",
                "olu\u015Fmu\u015Ftur" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "istanbul", "a\u011Fz\u0131", "t\u00FCrki", "T\u00FCrkiye",
                "t\u00FCrk\u00E7esiyaz", "T\u00FCrk\u00E7esiyaz\u0131", "dil", "dilinin",
                "kaynak", "kayna\u011F\u0131", "kabul", "edilir", "yaz", "yaz\u0131",
                "dil", "dili", "buak", "bua\u011F\u0131z", "temel", "temelinde", "olu\u015F",
                "olu\u015Fmu\u015Ftur" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }

    /**
     * The DefaultTokenizer has a completely different implementation from the Lucene-based
     * tokenizers (the latter were originally an external plugin, for licensing reasons).
     * It's based on Java's BreakIterator. It warrants testing so that it doesn't get overlooked
     * when changes are made to the other tokenizers.
     * <p>
     * The expected word tokens are identical for every {@link StemmingMode} in this test.
     */
    public void testDefault() {
        ITokenizer tok = new DefaultTokenizer();
        String orig = "The quick, brown <x0/> jumped over 1 \"lazy\" \u0130stanbul. "
                + "\u65E5\u672C\u8A9E\u3042\u3044\u3046\u3048\u304A\u3002";
        assertVerbatim(new String[] { "The", " ", "quick", ",", " ", "brown", " ", "<x0/>", " ",
                "jumped", " ", "over", " ", "1", " ", "\"", "lazy", "\"", " ", "\u0130stanbul", ".",
                " ", "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A", "\u3002" },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "\u0130stanbul",
                "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "\u0130stanbul",
                "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "\u0130stanbul",
                "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }

    /**
     * Asserts that a verbatim tokenization is correct in both of its forms.
     * <p>
     * Three things are verified: the string tokens equal <code>expected</code>,
     * the string tokens concatenate back into <code>origString</code> (i.e.
     * nothing was dropped or altered), and each {@link Token} yields the
     * expected text when applied to <code>origString</code>.
     *
     * @param expected   the tokens that should have been produced, in order
     * @param test       the tokens actually produced, as strings
     * @param testTok    the tokens actually produced, as {@link Token} objects
     * @param origString the string that was tokenized
     */
    private void assertVerbatim(String[] expected, String[] test, Token[] testTok, String origString) {
        assertResult(expected, test);
        // Compare against the original text rather than the joined expected
        // array: the latter is already implied by assertResult() above, while
        // this also catches an expected array that is itself lossy.
        assertEquals("Verbatim tokens do not reassemble into the original string",
                origString, StringUtils.join(test));
        assertNotNull("Tokenizer returned null Token array", testTok);
        assertEquals("Wrong number of Token objects", expected.length, testTok.length);
        for (int i = 0; i < expected.length; i++) {
            assertEquals("Mismatch at Token index " + i,
                    expected[i], testTok[i].getTextFromString(origString));
        }
    }

    /**
     * Asserts that two token arrays have the same length and equal elements
     * at every index. On failure the message includes the complete actual
     * token list, formatted as Java string literals so that it can be pasted
     * into a test when expectations need to be (re)generated.
     *
     * @param expected the tokens that should have been produced, in order
     * @param test     the tokens actually produced
     */
    private void assertResult(String[] expected, String[] test) {
        assertNotNull("Tokenizer returned null token array", test);
        String actual = describe(test);
        assertEquals("Wrong number of tokens; actual tokens: " + actual,
                expected.length, test.length);
        for (int i = 0; i < expected.length; i++) {
            assertEquals("Mismatch at token index " + i + "; actual tokens: " + actual,
                    expected[i], test[i]);
        }
    }

    /**
     * Renders tokens as a comma-separated list of double-quoted strings,
     * escaping embedded double quotes with a backslash.
     */
    private static String describe(String[] tokens) {
        StringBuilder sb = new StringBuilder();
        for (String token : tokens) {
            if (sb.length() > 0) {
                sb.append(", ");
            }
            sb.append('"').append(String.valueOf(token).replace("\"", "\\\"")).append('"');
        }
        return sb.toString();
    }
}
