/*
 * Decompiled with CFR 0.152.
 */
package goblin;

import fig.basic.LogInfo;
import fig.basic.Option;
import fig.exec.Execution;
import fig.prob.SampleUtils;
import goblin.CognateId;
import goblin.CognateSet;
import goblin.DataPrepUtils;
import goblin.DerivationTree;
import goblin.EMMain;
import goblin.Heldout;
import goblin.ObservationsTracker;
import goblin.ParamsTracker;
import goblin.ParamsTrackers;
import goblin.Taxon;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import nuts.math.MeasureZeroException;
import nuts.util.Arbre;
import nuts.util.Tree;
import pepper.Corpus;
import pepper.editmodel.TreeSampler;
import pepper.editmodel.Utils;

/**
 * Builds an initial {@link CognateSet} (and, when a held-out language is
 * configured, a {@link Heldout} set) from a word corpus, then serializes both
 * to the paths returned by {@link #getCognateSetOutPath()} and
 * {@link #getHeldoutOutPath()}.
 *
 * <p>The corpus is either parsed from {@link #wordsPath} or, when that option
 * equals {@link #GENERATE}, sampled from the restored parameters.
 */
public class PrepareEMData
implements Runnable {
    /** Sentinel value of {@link #wordsPath} requesting a generated corpus instead of a file. */
    public static final String GENERATE = "!generate";
    /** Location handed to {@code ParamsTrackers.restoreParamsTracker}. */
    @Option
    public String paramsPath;
    /**
     * Topology specification. It is first passed through
     * {@code DataPrepUtils.optionallyLoad} and then parsed with
     * {@code DataPrepUtils.lisp2tree}.
     */
    @Option
    public String topo;
    /** Corpus file handed to {@code Corpus.parse}, or {@link #GENERATE}. Must not be null when {@link #run()} is called. */
    @Option
    public String wordsPath;
    /** Language whose words are held out; {@code null} disables held-out creation. */
    @Option
    public String heldoutLang = null;
    /** Upper bound, as a proportion of the rows used, on the number of held-out rows. */
    @Option
    public double heldoutProp = 0.2;
    /** Languages handed to {@code Corpus.forget} after the corpus is obtained. */
    @Option
    public ArrayList<String> languagesToForget = new ArrayList<String>();
    @Option(gloss="Max number of words in training + test")
    public int maxNumberOfWords = 2000;
    /** Randomness source for corpus generation, row ordering, word fill-in and derivation sampling. */
    @Option
    Random rand = new Random(1L);
    private ParamsTracker params;
    private Tree<String> topology;
    private Corpus corpus;
    /** Number of rows reported valid by {@code DataPrepUtils.nValidRows}. */
    private double nValid;
    /** {@code min(nValid, maxNumberOfWords)}. */
    private double nUsed;
    /** Number of valid rows usable for held-out; {@code NaN} when held-out creation is disabled. */
    private double nValidHeldOut;
    /** Planned number of held-out rows (a whole number stored as a double); 0 when disabled. */
    private double heldoutSize;
    /** Held-out entries; stays {@code null} when {@link #heldoutLang} is {@code null}. */
    private Heldout heldout;
    private CognateSet cognates = new CognateSet();
    public static final String cognateSetOutput = "initCognateSet.seri";
    public static final String heldoutOutput = "heldout.seri";
    /**
     * Word removed by the most recent {@link #holdout} call that has not yet
     * been committed; {@code null} when nothing is pending.
     */
    private String trueReconstruction = null;

    /**
     * @return {@code true} when {@link #wordsPath} is set and is not the {@link #GENERATE} sentinel
     */
    public boolean isCorpusPathDefined() {
        return this.wordsPath != null && !this.wordsPath.equals(GENERATE);
    }

    /**
     * Restores the parameters, parses the topology, obtains the corpus (loaded
     * or generated), drops the languages in {@link #languagesToForget} and
     * computes the global statistics.
     *
     * @throws IllegalStateException if {@link #wordsPath} is not set
     */
    private void getData() throws IOException, ClassNotFoundException {
        // Fail fast with an explicit message instead of a bare NullPointerException
        // further down, and before the parameters are restored.
        if (this.wordsPath == null) {
            throw new IllegalStateException("wordsPath must be set (use \"" + GENERATE + "\" to generate a corpus)");
        }
        this.params = ParamsTrackers.restoreParamsTracker(this.paramsPath);
        LogInfo.logss("Encodings:\n" + this.params.getEncodings().toString());
        this.topology = DataPrepUtils.lisp2tree(this.topo);
        LogInfo.logss("Topology: " + this.topology.toString());
        if (this.wordsPath.equals(GENERATE)) {
            this.corpus = this.generateWords();
        } else {
            this.corpus = PrepareEMData.loadCorpus(this.wordsPath, this.topo);
        }
        this.corpus = Corpus.forget(this.corpus, new HashSet<String>(this.languagesToForget));
        this.computeGlobalStats();
    }

    /**
     * Generates {@link #maxNumberOfWords} rows with {@code DataPrepUtils.generate},
     * using the pre-order labels of the topology as the language names.
     */
    private Corpus generateWords() {
        ArrayList<List<String>> words = new ArrayList<List<String>>();
        List<Tree<String>> langTreeNames = this.topology.getPreOrderTraversal();
        ArrayList<String> langNames = new ArrayList<String>();
        for (Tree<String> tree : langTreeNames) {
            langNames.add(tree.getLabel());
        }
        for (int w = 0; w < this.maxNumberOfWords; ++w) {
            words.add(DataPrepUtils.generate(this.rand, this.topology, this.params));
        }
        return new Corpus(langNames, words);
    }

    /**
     * Parses the corpus at {@code path} and restricts it to the labels that
     * occur in the topology described by {@code topo}.
     */
    private static Corpus loadCorpus(String path, String topo) throws IOException {
        Corpus parsed = Corpus.parse(path);
        HashSet<String> topologyLabels = new HashSet<String>(PrepareEMData.nodes(DataPrepUtils.lisp2tree(topo)));
        return Corpus.restrict(parsed, topologyLabels);
    }

    /**
     * Collects the labels of {@code tree}: the root label first, followed by
     * the labels of each child subtree in child order.
     *
     * @param tree the tree to walk
     * @return a new list holding every label of the tree
     */
    public static <T> List<T> nodes(Tree<T> tree) {
        ArrayList<T> result = new ArrayList<T>();
        result.add(tree.getLabel());
        for (Tree<T> child : tree.getChildren()) {
            result.addAll(PrepareEMData.nodes(child));
        }
        return result;
    }

    /**
     * Logs corpus statistics and derives {@link #nValid},
     * {@link #nValidHeldOut}, {@link #nUsed} and {@link #heldoutSize}.
     */
    private void computeGlobalStats() {
        LogInfo.logss("Initial number of rows: " + this.corpus.getNWords());
        this.nValid = DataPrepUtils.nValidRows(this.corpus, this.params.getEncodings().allChars());
        Set<Character> unkChars = DataPrepUtils.unknownCharacters(this.corpus, this.params.getEncodings().allChars());
        LogInfo.logss("Number of invalid characters: " + unkChars.size());
        this.nValidHeldOut = this.createHeldout() ? (double)DataPrepUtils.nValidHeldoutRows(this.corpus, this.heldoutLang, this.params.getEncodings().allChars()) : Double.NaN;
        LogInfo.logss("Number of valid rows (those with only known characters): " + this.nValid);
        if (this.createHeldout()) {
            LogInfo.logss("Subset of them that could be used for " + this.heldoutLang + " heldout: " + this.nValidHeldOut);
        }
        this.nUsed = Math.min(this.nValid, (double)this.maxNumberOfWords);
        // The int cast truncates the planned held-out count to a whole number.
        this.heldoutSize = (int)(this.createHeldout() ? Math.min(this.heldoutProp * this.nUsed, this.nValidHeldOut) : 0.0);
        LogInfo.logss("Subset of them that will be used for heldout: " + this.heldoutSize);
    }

    /**
     * Runs the whole preparation: loads the data, visits the corpus rows in a
     * random permutation, turns each usable row into a cognate with sampled
     * derivations, optionally holds out the {@link #heldoutLang} word, and
     * finally saves the results.
     *
     * <p>Rows containing unknown characters are skipped. Processing stops once
     * more than {@link #maxNumberOfWords} rows without unknown characters have
     * been seen; rows later skipped (trimmed tree with fewer than two nodes, or
     * failed derivation sampling) still count towards that cap. Likewise, a
     * held-out slot taken by a row whose derivation sampling fails is not
     * given back.
     *
     * @throws RuntimeException unchecked exceptions propagate unchanged;
     *         checked exceptions are wrapped in a {@code RuntimeException}
     */
    @Override
    public void run() {
        this.topo = DataPrepUtils.optionallyLoad(this.topo);
        if (this.createHeldout()) {
            this.heldout = new Heldout(new Taxon(this.heldoutLang));
        }
        try {
            this.getData();
            int currentHeldout = 0;
            int currentWord = 0;
            for (int row : SampleUtils.samplePermutation(this.rand, this.corpus.getNWords())) {
                if (DataPrepUtils.unknownCharacters(this.corpus, row, this.params.getEncodings().allChars()).size() > 0) {
                    continue;
                }
                if (++currentWord > this.maxNumberOfWords) {
                    break;
                }
                Arbre<DerivationTree.DerivationNode> current = DataPrepUtils.tree2arbre(this.topology, this.corpus.getWords(row));
                current = DataPrepUtils.trim(current);
                if (current.nodes().size() < 2) {
                    continue;
                }
                boolean isHeldout = false;
                if (this.createHeldout() && DataPrepUtils.isValidForHeldout(this.corpus, row, this.heldoutLang) && (double)currentHeldout < this.heldoutSize) {
                    isHeldout = true;
                    this.holdout(row, current);
                    ++currentHeldout;
                }
                // Observations are taken after the held-out word has been blanked
                // and before the remaining words are filled in.
                ObservationsTracker obs = DataPrepUtils.observations(current);
                DataPrepUtils.fillInWords(current, this.rand);
                try {
                    ParamsTrackers.HomoParamsTracker simpleParams = new ParamsTrackers.HomoParamsTracker(TreeSampler.initEditParam(this.params.getEncodings(), null, 1.0, 90.0, 7.0, 2.0, false));
                    Arbre<DerivationTree.DerivationNode> withSampledDerivs = DataPrepUtils.sampleDerivationsUsingObservedSampler(current, simpleParams, this.rand);
                    this.cognates.addCognate(this.cognateId(row), withSampledDerivs, obs);
                    if (isHeldout) {
                        this.commitHoldout(row);
                    }
                }
                catch (MeasureZeroException mze) {
                    LogInfo.logss("Impossible to sample an initial derivation for " + this.cognateId(row));
                    // The row is dropped, so its pending held-out word must not
                    // linger and be mistaken for a later row's reconstruction.
                    this.trueReconstruction = null;
                }
            }
            LogInfo.logss("Actual number of cognates prepared: " + this.cognates.size());
            if (this.heldout != null) {
                LogInfo.logss("Actual number of cognates heldout: " + this.heldout.size());
            }
            this.saveResult();
        }
        catch (RuntimeException e) {
            // Already unchecked: rethrow as-is rather than wrapping it a second time.
            throw e;
        }
        catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Saves the cognate set and the held-out set, then dumps the corpus via
     * {@code EMMain.dumpStat}.
     */
    private void saveResult() throws IOException {
        CognateSet.saveCognateSet(this.cognates, PrepareEMData.getCognateSetOutPath());
        // NOTE(review): heldout is null when no heldoutLang was configured and is
        // still passed on here -- confirm Heldout.saveHeldout tolerates null.
        Heldout.saveHeldout(this.heldout, PrepareEMData.getHeldoutOutPath());
        EMMain.dumpStat("corpus", this.corpus.toString());
    }

    /** @return the path obtained from {@code Utils.safeGetExecFilePath} for {@link #cognateSetOutput} */
    public static String getCognateSetOutPath() {
        return Utils.safeGetExecFilePath(cognateSetOutput);
    }

    /** @return the path obtained from {@code Utils.safeGetExecFilePath} for {@link #heldoutOutput} */
    public static String getHeldoutOutPath() {
        return Utils.safeGetExecFilePath(heldoutOutput);
    }

    /**
     * Remembers the word of the {@link #heldoutLang} node of {@code current} in
     * {@link #trueReconstruction} and replaces that node's contents with a
     * word-less node for the same language.
     *
     * @param rowIndex corpus row being processed (used for error reporting)
     * @param current derivation tree of that row; modified in place
     * @throws IllegalStateException if the tree has no node for the held-out language
     */
    private void holdout(int rowIndex, Arbre<DerivationTree.DerivationNode> current) {
        Taxon lang = new Taxon(this.heldoutLang);
        Arbre<DerivationTree.DerivationNode> toheldOut = DerivationTree.findNodeByLangName(current, lang);
        if (toheldOut == null) {
            throw new IllegalStateException("No node for heldout language " + this.heldoutLang + " in row " + rowIndex);
        }
        this.trueReconstruction = toheldOut.getContents().getWord();
        toheldOut.setContents(new DerivationTree.DerivationNode(lang, null));
    }

    /**
     * Records the pending held-out word for row {@code rowIndex} in
     * {@link #heldout} and clears the pending state.
     *
     * @throws RuntimeException if no held-out word is pending
     */
    private void commitHoldout(int rowIndex) {
        if (this.trueReconstruction == null) {
            throw new RuntimeException("trueReconstruction should not be null in commitHoldout");
        }
        this.heldout.addWordHeldoutEntry(this.cognateId(rowIndex), this.trueReconstruction);
        // Consume the pending word so the null check above stays meaningful
        // for every subsequent call, not just the first one.
        this.trueReconstruction = null;
    }

    /** Builds the identifier {@code wordsPath + ":" + rowIndex} for a corpus row. */
    private CognateId cognateId(int rowIndex) {
        return new CognateId(this.wordsPath + ":" + rowIndex);
    }

    /** @return {@code true} when a held-out language has been configured */
    public boolean createHeldout() {
        return this.heldoutLang != null;
    }

    /** Command-line entry point; delegates to {@code Execution.run}. */
    public static void main(String[] args) {
        Execution.run(args, new PrepareEMData());
    }
}

