/*
 * Decompiled with CFR 0.152.
 */
package pty.io;

import fig.basic.LogInfo;
import fig.basic.Option;
import fig.basic.Pair;
import fig.exec.Execution;
import goblin.Taxon;
import java.io.File;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import nuts.io.IO;
import nuts.util.CollUtils;
import nuts.util.Counter;
import nuts.util.CounterMap;
import nuts.util.Indexer;
import pty.ObservationDimensions;
import pty.io.Dataset;
import pty.io.WalsAnn;
import pty.io.WalsProcessingScript;
import pty.smc.MapLeaves;

public class WalsDataset
implements Dataset {
    // Path handed to WalsProcessingScript by getScript().
    @Option
    public static String scriptPath = "/Users/bouchard/Documents/workspace/evolvere/data/wals-preprocessing-script";
    // When true, LanguageDatabase names each Taxon by its WALS code; otherwise by a cleaned language name.
    @Option
    public static boolean useWalsCodeForLanguages = true;
    // Directory that getPreprocessedCorpus() hands to the Parser.
    @Option
    public static String walsPath = "data/wals_data";
    // Family name used by getFamilyRestriction(); null or "" means "no restriction".
    @Option
    public static String languageFamilyRestriction = "Indo-European";
    // Explicit language names used by getListRestriction(); an empty list means "no restriction".
    @Option
    public static ArrayList<String> languageListRestriction = new ArrayList();
    // UNDERUSED_SITES keeps a site only if (#data points for the site / #languages) is strictly greater than this.
    @Option
    public static double sitesFractionThreshold = 0.25;
    // UNDERUSED_CHARS keeps a value only if (#occurrences of (site, value) / #languages) is strictly greater than this.
    @Option
    public static double charsFractionThreshold = 0.25;
    // UNDERDOCUMENTED_LANGS keeps a language only if it has strictly more known sites than this.
    @Option
    public static int languageCountThreshold = 10;
    // Selects what getReferenceClusters() returns: the family map when true, the genus map when false.
    @Option
    public static boolean useFamilyAsRef = false;
    // Operations applied, in list order, by getPreprocessedCorpus().
    @Option
    public static ArrayList<WalsCorpusOperation> preprocessingSteps = new ArrayList<WalsCorpusOperation>(Arrays.asList(WalsCorpusOperation.FAMILY_RESTRICT, WalsCorpusOperation.UNDERDOCUMENTED_LANGS, WalsCorpusOperation.UNDERUSED_SITES, WalsCorpusOperation.BINARIZE, WalsCorpusOperation.REMOVE_DEGENERATE_SITES));
    // The observations: (language, site) -> observed character. Pairs absent from the map are unknown.
    private final Map<Pair<Taxon, Site>, BioCharacter> data;
    // Lazily built by siteIndexer().
    private transient Indexer<Site> _siteIndexer = null;
    // Lazily built by charIndexers().
    private transient Map<Site, Indexer<BioCharacter>> _charIndexers = null;
    // Shared language metadata; set once by loadLanguageDatabase() and never reloaded afterwards.
    public static LanguageDatabase langDB = null;
    // Lazily built by allLanguages().
    private transient Set<Taxon> _allLanguages = null;
    // Lazily built by allSites().
    private transient Set<Site> _allSites = null;
    // Cache used by allCharacters().
    // NOTE(review): transient but created by a field initializer, so default Java deserialization
    // would not restore it — confirm whether instances are ever deserialized.
    private transient Map<Site, Set<BioCharacter>> _allCharacters = new HashMap<Site, Set<BioCharacter>>();
    // Cache used by knownSites(); the NOTE(review) on _allCharacters applies here as well.
    private transient Map<Taxon, Set<Site>> _knownCharacter = new HashMap<Taxon, Set<Site>>();

    /**
     * Parses the WALS files found under {@link #walsPath} and applies
     * {@link #preprocessingSteps} to the result, in order.
     *
     * The whole operation is wrapped in a "Preprocessing WALS" log track; the
     * track is closed in a {@code finally} block so that it is also closed when
     * parsing or one of the operations throws.
     *
     * @return the dataset obtained after the last preprocessing step
     */
    public static WalsDataset getPreprocessedCorpus() {
        LogInfo.track((Object)"Preprocessing WALS", true);
        try {
            WalsDataset initial = new Parser(new File(walsPath)).getDataset();
            LogInfo.logs("Initial: " + initial.summary());
            return WalsDataset.getProcessedDataset(initial, preprocessingSteps);
        }
        finally {
            LogInfo.end_track();
        }
    }

    /**
     * Encodes every language as an array of character indices (with -1 for
     * unknown values) and hands the result to
     * {@code Dataset.DatasetUtils.convert}.
     */
    @Override
    public Map<Taxon, double[][]> observations() {
        Map<Taxon, int[]> encoded = this.toObservationArrays(-1);
        return Dataset.DatasetUtils.convert(encoded, (ObservationDimensions)this);
    }

    /**
     * Always reports that reference clusters are available; see
     * {@link #getReferenceClusters()}.
     */
    @Override
    public boolean hasReferenceClusters() {
        return true;
    }

    /**
     * Returns the reference label of each language: the family map of
     * {@link #langDB} when {@link #useFamilyAsRef} is set, its genus map
     * otherwise.
     */
    @Override
    public Map<Taxon, String> getReferenceClusters() {
        if (useFamilyAsRef) {
            return langDB.familyMap();
        }
        return langDB.genusMap();
    }

    /**
     * Creates a dataset backed by the result of {@code CollUtils.archive(data)}.
     *
     * @param data (language, site) -> observed character
     */
    private WalsDataset(Map<Pair<Taxon, Site>, BioCharacter> data) {
        Map<Pair<Taxon, Site>, BioCharacter> archived = CollUtils.archive(data);
        this.data = archived;
    }

    /**
     * One-line description of the dataset size, of the form
     * {@code "<n> languages and <m> sites"}.
     */
    public String summary() {
        int nLanguages = this.allLanguages().size();
        int nSites = this.allSites().size();
        return nLanguages + " languages and " + nSites + " sites";
    }

    /**
     * Renders one language: its name on the first line, then one
     * tab-indented {@code site<TAB>value} line per known site.
     *
     * @param lang a language present in this dataset
     * @return the multi-line description, terminated by a newline
     * @throws RuntimeException if {@code lang} is not in this dataset
     */
    public String toString(Taxon lang) {
        if (!this.allLanguages().contains(lang)) {
            throw new RuntimeException("Unknown language: " + lang);
        }
        StringBuilder result = new StringBuilder();
        result.append(lang).append("\n");
        for (Site site : this.knownSites(lang)) {
            result.append("\t").append(site).append("\t").append(this.get(lang, site)).append("\n");
        }
        return result.toString();
    }

    /**
     * Renders the whole dataset: the {@link #summary()} line followed by the
     * per-language description of every language.
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder();
        out.append(this.summary()).append("\n");
        for (Taxon language : this.allLanguages()) {
            String rendered = this.toString(language);
            out.append(rendered);
        }
        return out.toString();
    }

    /**
     * Renders the dataset as text: a header line
     * {@code "<#languages> <#sites>"}, then one line per language made of the
     * cleaned WALS code immediately followed by one symbol per site (the
     * character index, or {@code ?} when the value is unknown).
     *
     * @return the rendered text
     */
    public String toPhylip() {
        // Single definition of the sentinel used both to encode and to detect unknown values.
        final int unknownCode = -1;
        Map<Taxon, int[]> arrays = this.toObservationArrays(unknownCode);
        StringBuilder result = new StringBuilder();
        result.append(this.allLanguages().size()).append(" ").append(this.allSites().size()).append("\n");
        for (Map.Entry<Taxon, int[]> entry : arrays.entrySet()) {
            String langCode = langDB.findWalsCode(entry.getKey());
            result.append(WalsAnn.cleanForPhylip(langCode));
            for (int code : entry.getValue()) {
                if (code == unknownCode) {
                    result.append("?");
                } else {
                    result.append(code);
                }
            }
            result.append("\n");
        }
        return result.toString();
    }

    /**
     * Encodes every language as an int array indexed by site index (see
     * {@link #siteIndexer()}). Each cell holds the index of the observed
     * character in that site's character indexer (see
     * {@link #charIndexers()}), or {@code unknownCode} when the language has
     * no value for the site.
     *
     * @param unknownCode value stored for (language, site) pairs without data
     * @return language -> encoded observations
     */
    private Map<Taxon, int[]> toObservationArrays(int unknownCode) {
        HashMap<Taxon, int[]> result = new HashMap<Taxon, int[]>();
        Indexer<Site> siteIndexer = this.siteIndexer();
        Map<Site, Indexer<BioCharacter>> charIndexers = this.charIndexers();
        for (Taxon lang : this.allLanguages()) {
            int[] current = new int[siteIndexer.size()];
            for (int s = 0; s < current.length; ++s) {
                Site site = siteIndexer.i2o(s);
                if (this.isKnown(lang, site)) {
                    current[s] = charIndexers.get(site).o2i(this.get(lang, site));
                } else {
                    current[s] = unknownCode;
                }
            }
            result.put(lang, current);
        }
        return result;
    }

    /**
     * Returns the indexer over all sites, built on first use from the sites
     * sorted in their natural order and cached afterwards.
     */
    public Indexer<Site> siteIndexer() {
        if (this._siteIndexer == null) {
            ArrayList<Site> ordered = new ArrayList<Site>(this.allSites());
            Collections.sort(ordered);
            this._siteIndexer = new Indexer<Site>(ordered);
        }
        return this._siteIndexer;
    }

    /**
     * Returns, for every site, an indexer over the characters observed at
     * that site. Built on first use and cached afterwards.
     *
     * Characters are sorted before indexing; an exception raised while
     * sorting is printed to standard output and indexing continues with the
     * list as it is.
     */
    public Map<Site, Indexer<BioCharacter>> charIndexers() {
        if (this._charIndexers == null) {
            this._charIndexers = new HashMap<Site, Indexer<BioCharacter>>();
            for (Site site : this.allSites()) {
                ArrayList<BioCharacter> ordered = new ArrayList<BioCharacter>(this.allCharacters(site));
                try {
                    Collections.sort(ordered);
                }
                catch (Exception sortFailure) {
                    System.out.println(sortFailure);
                }
                Indexer<BioCharacter> indexer = new Indexer<BioCharacter>(ordered);
                this._charIndexers.put(site, indexer);
            }
        }
        return this._charIndexers;
    }

    /**
     * Creates a {@code WalsProcessingScript} from the file at
     * {@link #scriptPath}.
     */
    public static WalsProcessingScript getScript() {
        File scriptFile = new File(scriptPath);
        return new WalsProcessingScript(scriptFile);
    }

    /**
     * Returns the languages whose family equals
     * {@link #languageFamilyRestriction}. When that option is null or empty,
     * every language of the database is returned (as the key-set view of the
     * database's family map).
     */
    public static Set<Taxon> getFamilyRestriction() {
        boolean unrestricted = languageFamilyRestriction == null || languageFamilyRestriction.equals("");
        if (unrestricted) {
            return langDB.family.keySet();
        }
        HashSet<Taxon> matching = new HashSet<Taxon>();
        for (Map.Entry<Taxon, String> entry : langDB.family.entrySet()) {
            if (entry.getValue().equals(languageFamilyRestriction)) {
                matching.add(entry.getKey());
            }
        }
        return matching;
    }

    /**
     * Returns the languages named in {@link #languageListRestriction}. When
     * that list is empty, every language of the database is returned (as the
     * key-set view of the database's family map).
     *
     * @throws RuntimeException if a listed language is not in the database
     */
    public static Set<Taxon> getListRestriction() {
        Set<Taxon> known = langDB.family.keySet();
        if (languageListRestriction.isEmpty()) {
            return known;
        }
        HashSet<Taxon> result = new HashSet<Taxon>();
        for (String langStr : languageListRestriction) {
            Taxon cur = new Taxon(langStr);
            if (!known.contains(cur)) {
                // Name the offending entry so a mistyped option can be fixed.
                throw new RuntimeException("Language '" + langStr + "' from languageListRestriction is not in the language database");
            }
            result.add(cur);
        }
        return result;
    }

    /**
     * Loads {@link #langDB} from {@code languages.tab} inside
     * {@code walsDir}, unless a database has already been loaded.
     *
     * @param walsDir directory containing {@code languages.tab}
     */
    public static void loadLanguageDatabase(File walsDir) {
        if (langDB == null) {
            File languageFile = new File(walsDir, "languages.tab");
            langDB = new LanguageDatabase(languageFile);
        }
    }

    /**
     * Builds a new dataset containing only the data points whose language
     * belongs to {@code restr}.
     *
     * @param dataset the dataset to filter (not modified)
     * @param restr   languages to keep
     */
    public static WalsDataset languageRestrict(WalsDataset dataset, Set<Taxon> restr) {
        HashMap<Pair<Taxon, Site>, BioCharacter> kept = new HashMap<Pair<Taxon, Site>, BioCharacter>();
        for (Map.Entry<Pair<Taxon, Site>, BioCharacter> entry : dataset.data.entrySet()) {
            Pair<Taxon, Site> key = entry.getKey();
            if (restr.contains(key.getFirst())) {
                kept.put(key, entry.getValue());
            }
        }
        return new WalsDataset(kept);
    }

    /**
     * Entry point: runs a {@link WalsDatasetApp} through
     * {@code Execution.run}, registering this class under the name "wals".
     */
    public static void main(String[] args) {
        WalsDatasetApp app = new WalsDatasetApp();
        Execution.run(args, app, "wals", WalsDataset.class);
    }

    /**
     * Applies the given operations to {@code initial}, in list order, logging
     * a summary of the dataset after each one.
     *
     * @param initial starting dataset
     * @param list    operations to apply
     * @return the dataset produced by the last operation, or {@code initial}
     *         when the list is empty
     */
    public static WalsDataset getProcessedDataset(WalsDataset initial, List<WalsCorpusOperation> list) {
        WalsDataset dataset = initial;
        for (WalsCorpusOperation step : list) {
            dataset = step.apply(dataset);
            String message = "After " + step.summary() + ": " + dataset.summary();
            LogInfo.logs(message);
        }
        return dataset;
    }

    /**
     * Re-arranges a counter over pairs into a two-level counter map: the
     * count of {@code (s, t)} becomes the count of {@code t} under key
     * {@code s}.
     */
    public static <S, T> CounterMap<S, T> toCounterMap(Counter<Pair<S, T>> c) {
        CounterMap<S, T> nested = new CounterMap<S, T>();
        for (Pair<S, T> pair : c.keySet()) {
            S outer = pair.getFirst();
            T inner = pair.getSecond();
            nested.setCount(outer, inner, c.getCount(pair));
        }
        return nested;
    }

    /**
     * Counts, for every (site, character) pair, the number of data points
     * holding that character at that site.
     */
    public Counter<Pair<Site, BioCharacter>> getBioCharacterCounts() {
        Counter<Pair<Site, BioCharacter>> counts = new Counter<Pair<Site, BioCharacter>>();
        for (Map.Entry<Pair<Taxon, Site>, BioCharacter> entry : this.data.entrySet()) {
            Site site = entry.getKey().getSecond();
            counts.incrementCount(Pair.makePair(site, entry.getValue()), 1.0);
        }
        return counts;
    }

    /**
     * Counts, for every site, the number of data points recorded at that
     * site.
     */
    public Counter<Site> getSiteCounts() {
        Counter<Site> counts = new Counter<Site>();
        for (Pair<Taxon, Site> langAndSite : this.data.keySet()) {
            Site site = langAndSite.getSecond();
            counts.incrementCount(site, 1.0);
        }
        return counts;
    }

    /**
     * Returns the set of languages having at least one data point. Computed
     * on first use and cached afterwards.
     */
    public Set<Taxon> allLanguages() {
        if (this._allLanguages == null) {
            Set<Taxon> languages = new HashSet<Taxon>();
            for (Pair<Taxon, Site> langAndSite : this.data.keySet()) {
                languages.add(langAndSite.getFirst());
            }
            this._allLanguages = languages;
        }
        return this._allLanguages;
    }

    /**
     * Returns the set of sites having at least one data point. Computed on
     * first use and cached afterwards.
     */
    public Set<Site> allSites() {
        if (this._allSites == null) {
            Set<Site> sites = new HashSet<Site>();
            for (Pair<Taxon, Site> langAndSite : this.data.keySet()) {
                sites.add(langAndSite.getSecond());
            }
            this._allSites = sites;
        }
        return this._allSites;
    }

    /**
     * Returns the set of characters observed at {@code site}.
     *
     * On the first cache miss the character sets of all sites are computed
     * in a single pass over the data and cached.
     *
     * @param site a site present in this dataset
     * @throws RuntimeException if {@code site} is not in this dataset
     */
    public Set<BioCharacter> allCharacters(Site site) {
        if (!this.allSites().contains(site)) {
            throw new RuntimeException("Unknown site: " + site);
        }
        // The cache is transient: recreate it if it is missing (e.g. not restored by
        // default deserialization), as the other lazily built caches of this class do.
        if (this._allCharacters == null) {
            this._allCharacters = new HashMap<Site, Set<BioCharacter>>();
        }
        Set<BioCharacter> cached = this._allCharacters.get(site);
        if (cached != null) {
            return cached;
        }
        for (Map.Entry<Pair<Taxon, Site>, BioCharacter> entry : this.data.entrySet()) {
            CollUtils.getNoNullSet(this._allCharacters, entry.getKey().getSecond()).add(entry.getValue());
        }
        return this._allCharacters.get(site);
    }

    /**
     * Tells whether this dataset holds a value for {@code lang} at
     * {@code site}.
     *
     * @param lang a language present in this dataset
     * @param site a site present in this dataset
     * @throws RuntimeException if the language or the site does not occur
     *         anywhere in this dataset
     */
    public boolean isKnown(Taxon lang, Site site) {
        if (!this.allLanguages().contains(lang)) {
            throw new RuntimeException("Unknown language: " + lang);
        }
        if (!this.allSites().contains(site)) {
            throw new RuntimeException("Unknown site: " + site);
        }
        return this.data.containsKey(Pair.makePair(lang, site));
    }

    /**
     * Returns the sites for which {@code lang} has a value.
     *
     * On the first cache miss the site sets of all languages are computed in
     * a single pass over the data, so that querying every language costs one
     * scan in total instead of one scan per language.
     *
     * @param lang a language present in this dataset
     * @throws RuntimeException if {@code lang} is not in this dataset
     */
    public Set<Site> knownSites(Taxon lang) {
        if (!this.allLanguages().contains(lang)) {
            throw new RuntimeException("Unknown language: " + lang);
        }
        // The cache is transient: recreate it if it is missing (e.g. not restored by
        // default deserialization), as the other lazily built caches of this class do.
        if (this._knownCharacter == null) {
            this._knownCharacter = new HashMap<Taxon, Set<Site>>();
        }
        Set<Site> cached = this._knownCharacter.get(lang);
        if (cached != null) {
            return cached;
        }
        for (Pair<Taxon, Site> key : this.data.keySet()) {
            Set<Site> sites = this._knownCharacter.get(key.getFirst());
            if (sites == null) {
                sites = new HashSet<Site>();
                this._knownCharacter.put(key.getFirst(), sites);
            }
            sites.add(key.getSecond());
        }
        // lang is in allLanguages(), hence it owns at least one data point and an entry now exists.
        return this._knownCharacter.get(lang);
    }

    /**
     * Returns the character observed for {@code lang} at {@code site}.
     *
     * @throws RuntimeException if the language or site is not in this
     *         dataset, or if no value is recorded for the pair
     */
    public BioCharacter get(Taxon lang, Site site) {
        if (!this.isKnown(lang, site)) {
            throw new RuntimeException("No value recorded for language " + lang + " at site " + site);
        }
        return this.data.get(Pair.makePair(lang, site));
    }

    /**
     * Number of distinct characters observed at the site having the given
     * index in {@link #siteIndexer()}.
     */
    @Override
    public int nCharacter(int site) {
        Site indexed = this.siteIndexer().i2o(site);
        return this.allCharacters(indexed).size();
    }

    /**
     * Number of sites having at least one data point.
     */
    @Override
    public int nSites() {
        Set<Site> sites = this.allSites();
        return sites.size();
    }

    /**
     * A value that a site can take, identified by its name.
     *
     * Equality, hashing, ordering and {@link #toString()} are all based on
     * the name only; {@link #index} does not take part in them.
     */
    public static class BioCharacter
    implements Serializable,
    Comparable<BioCharacter> {
        private static final long serialVersionUID = 1L;
        private final String string;
        /** Index given at construction, or -1 when none was given. */
        public final int index;

        /**
         * Creates a character with index -1.
         *
         * @param string non-empty name of the character
         * @throws IllegalArgumentException if {@code string} is null or empty
         */
        public BioCharacter(String string) {
            this(string, -1);
        }

        /**
         * @param string non-empty name of the character
         * @param index  index stored in {@link #index}
         * @throws IllegalArgumentException if {@code string} is null or empty
         */
        public BioCharacter(String string, int index) {
            if (string == null || string.isEmpty()) {
                // IllegalArgumentException is a RuntimeException, so existing handlers still match.
                throw new IllegalArgumentException("Name of character should be nontrivial");
            }
            this.string = string;
            this.index = index;
        }

        /** Two characters are equal when their names are equal. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof BioCharacter)) {
                return false;
            }
            return this.string.equals(((BioCharacter)obj).string);
        }

        @Override
        public int hashCode() {
            return this.string.hashCode();
        }

        /** Returns the name of the character. */
        @Override
        public String toString() {
            return this.string;
        }

        /** Orders characters by name. */
        @Override
        public int compareTo(BioCharacter arg0) {
            return this.string.compareTo(arg0.string);
        }
    }

    /**
     * A site (feature) of the dataset, identified by its name.
     *
     * Equality, hashing, ordering and {@link #toString()} are all based on
     * the name.
     */
    public static class Site
    implements Serializable,
    Comparable<Site> {
        private static final long serialVersionUID = 1L;
        private final String string;

        /**
         * @param string non-empty name of the site
         * @throws IllegalArgumentException if {@code string} is null or empty
         */
        public Site(String string) {
            if (string == null || string.isEmpty()) {
                // IllegalArgumentException is a RuntimeException, so existing handlers still match.
                throw new IllegalArgumentException("Name of site should be nontrivial");
            }
            this.string = string;
        }

        /** Two sites are equal when their names are equal. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof Site)) {
                return false;
            }
            return this.string.equals(((Site)obj).string);
        }

        @Override
        public int hashCode() {
            return this.string.hashCode();
        }

        /** Returns the name of the site. */
        @Override
        public String toString() {
            return this.string;
        }

        /** Orders sites by name. */
        @Override
        public int compareTo(Site arg0) {
            return this.string.compareTo(arg0.string);
        }
    }

    /**
     * Preprocessing operations on a {@link WalsDataset}. Each operation
     * leaves its input untouched and returns a new dataset.
     */
    public static enum WalsCorpusOperation {
        /** Keeps only the languages returned by {@link WalsDataset#getFamilyRestriction()}. */
        FAMILY_RESTRICT{

            @Override
            public String summary() {
                return super.toString() + "(" + languageFamilyRestriction + ")";
            }

            @Override
            public WalsDataset apply(WalsDataset dataset) {
                return WalsDataset.languageRestrict(dataset, WalsDataset.getFamilyRestriction());
            }
        }
        ,
        /** Keeps only the languages returned by {@link WalsDataset#getListRestriction()}. */
        LIST_RESTRICT{

            @Override
            public String summary() {
                return super.toString() + "(" + languageListRestriction + ")";
            }

            @Override
            public WalsDataset apply(WalsDataset dataset) {
                return WalsDataset.languageRestrict(dataset, WalsDataset.getListRestriction());
            }
        }
        ,
        /** Keeps only the languages having strictly more than {@code languageCountThreshold} known sites. */
        UNDERDOCUMENTED_LANGS{

            @Override
            public String summary() {
                return super.toString() + "(" + languageCountThreshold + ")";
            }

            @Override
            public WalsDataset apply(WalsDataset dataset) {
                Set<Taxon> restr = new HashSet<Taxon>();
                for (Taxon lang : dataset.allLanguages()) {
                    if (dataset.knownSites(lang).size() > languageCountThreshold) {
                        restr.add(lang);
                    }
                }
                return WalsDataset.languageRestrict(dataset, restr);
            }
        }
        ,
        /**
         * Drops the data points whose (site, value) combination occurs in a
         * fraction of the languages that is not strictly greater than
         * {@code charsFractionThreshold}.
         */
        UNDERUSED_CHARS{

            @Override
            public String summary() {
                return super.toString() + "(" + charsFractionThreshold + ")";
            }

            @Override
            public WalsDataset apply(WalsDataset dataset) {
                double nLangs = dataset.allLanguages().size();
                Counter<Pair<Site, BioCharacter>> charCounter = dataset.getBioCharacterCounts();
                Map<Pair<Taxon, Site>, BioCharacter> newData = new HashMap<Pair<Taxon, Site>, BioCharacter>();
                for (Map.Entry<Pair<Taxon, Site>, BioCharacter> entry : dataset.data.entrySet()) {
                    Pair<Site, BioCharacter> siteAndValue = Pair.makePair(entry.getKey().getSecond(), entry.getValue());
                    if (charCounter.getCount(siteAndValue) / nLangs > charsFractionThreshold) {
                        newData.put(entry.getKey(), entry.getValue());
                    }
                }
                return new WalsDataset(newData);
            }
        }
        ,
        /**
         * Applies the processing script returned by
         * {@link WalsDataset#getScript()}: data points whose language or
         * site the script ignores are dropped; data points whose site the
         * script translates are replaced by the translated (site, value)
         * pairs; everything else is copied unchanged.
         */
        SCRIPT{

            @Override
            public WalsDataset apply(WalsDataset dataset) {
                Map<Pair<Taxon, Site>, BioCharacter> newData = new HashMap<Pair<Taxon, Site>, BioCharacter>();
                WalsProcessingScript script = WalsDataset.getScript();
                for (Map.Entry<Pair<Taxon, Site>, BioCharacter> entry : dataset.data.entrySet()) {
                    Pair<Taxon, Site> key = entry.getKey();
                    Site site = key.getSecond();
                    Taxon taxon = key.getFirst();
                    if (script.ignore(taxon) || script.ignore(site)) continue;
                    BioCharacter value = entry.getValue();
                    if (script.shouldTranslate(site)) {
                        for (Pair<Site, BioCharacter> translated : script.translate(Pair.makePair(site, value))) {
                            newData.put(Pair.makePair(taxon, translated.getFirst()), translated.getSecond());
                        }
                        continue;
                    }
                    newData.put(key, value);
                }
                return new WalsDataset(newData);
            }

            @Override
            public String summary() {
                return this.toString();
            }
        }
        ,
        /**
         * Replaces every data point (lang, site) = value by one binary data
         * point per character v observed at that site: the new site is named
         * {@code "<site>=<v>"} and holds ONE when v equals the observed
         * value, ZERO otherwise.
         */
        ENCODE_BIN{
            public final BioCharacter ZERO = new BioCharacter("ZERO");
            public final BioCharacter ONE = new BioCharacter("ONE");

            @Override
            public WalsDataset apply(WalsDataset dataset) {
                Map<Pair<Taxon, Site>, BioCharacter> newData = new HashMap<Pair<Taxon, Site>, BioCharacter>();
                for (Map.Entry<Pair<Taxon, Site>, BioCharacter> entry : dataset.data.entrySet()) {
                    Taxon lang = entry.getKey().getFirst();
                    Site site = entry.getKey().getSecond();
                    BioCharacter value = entry.getValue();
                    for (BioCharacter otherValue : dataset.allCharacters(site)) {
                        Site newSite = new Site("" + site + "=" + otherValue);
                        Pair<Taxon, Site> newKey = Pair.makePair(lang, newSite);
                        newData.put(newKey, value.equals(otherValue) ? this.ONE : this.ZERO);
                    }
                }
                return new WalsDataset(newData);
            }

            @Override
            public String summary() {
                return this.toString();
            }
        }
        ,
        /**
         * Makes every site two-valued: the character returned by
         * {@code argMax()} on the site's counts is kept, every other
         * character is replaced by a single placeholder character.
         */
        BINARIZE{
            public final BioCharacter SET_OF_COLLAPSED_NON_MODE_VALUES = new BioCharacter("SET_OF_COLLAPSED_NON_MODE_VALUES");

            @Override
            public WalsDataset apply(WalsDataset dataset) {
                CounterMap<Site, BioCharacter> charCounter = WalsDataset.toCounterMap(dataset.getBioCharacterCounts());
                // argMax is looked up once per site instead of once per data point.
                Map<Site, BioCharacter> modes = new HashMap<Site, BioCharacter>();
                Map<Pair<Taxon, Site>, BioCharacter> newData = new HashMap<Pair<Taxon, Site>, BioCharacter>();
                for (Map.Entry<Pair<Taxon, Site>, BioCharacter> entry : dataset.data.entrySet()) {
                    Site site = entry.getKey().getSecond();
                    BioCharacter mode = modes.get(site);
                    if (mode == null) {
                        mode = (BioCharacter)charCounter.getCounter(site).argMax();
                        modes.put(site, mode);
                    }
                    BioCharacter value = entry.getValue();
                    newData.put(entry.getKey(), mode.equals(value) ? value : this.SET_OF_COLLAPSED_NON_MODE_VALUES);
                }
                return new WalsDataset(newData);
            }

            @Override
            public String summary() {
                return this.toString();
            }
        }
        ,
        /** Drops every site at which fewer than two distinct characters are observed. */
        REMOVE_DEGENERATE_SITES{

            @Override
            public WalsDataset apply(WalsDataset dataset) {
                CounterMap<Site, BioCharacter> charCounter = WalsDataset.toCounterMap(dataset.getBioCharacterCounts());
                Map<Pair<Taxon, Site>, BioCharacter> newData = new HashMap<Pair<Taxon, Site>, BioCharacter>();
                for (Map.Entry<Pair<Taxon, Site>, BioCharacter> entry : dataset.data.entrySet()) {
                    if (charCounter.getCounter(entry.getKey().getSecond()).size() >= 2) {
                        newData.put(entry.getKey(), entry.getValue());
                    }
                }
                return new WalsDataset(newData);
            }

            @Override
            public String summary() {
                return this.toString();
            }
        }
        ,
        /**
         * Drops every language whose family, according to {@code langDB}, is
         * {@code "other"}. A language without a family entry is kept.
         */
        REMOVE_SIGN{

            @Override
            public WalsDataset apply(WalsDataset dataset) {
                // Fetch the (wrapped) family map once rather than once per data point.
                Map<Taxon, String> families = langDB.familyMap();
                Map<Pair<Taxon, Site>, BioCharacter> newData = new HashMap<Pair<Taxon, Site>, BioCharacter>();
                for (Map.Entry<Pair<Taxon, Site>, BioCharacter> entry : dataset.data.entrySet()) {
                    // Constant-first comparison: a missing family entry is not an error.
                    if ("other".equals(families.get(entry.getKey().getFirst()))) continue;
                    newData.put(entry.getKey(), entry.getValue());
                }
                return new WalsDataset(newData);
            }

            @Override
            public String summary() {
                return this.toString();
            }
        }
        ,
        /**
         * Drops every site whose number of data points, divided by the
         * number of languages, is not strictly greater than
         * {@code sitesFractionThreshold}.
         */
        UNDERUSED_SITES{

            @Override
            public String summary() {
                return super.toString() + "(" + sitesFractionThreshold + ")";
            }

            @Override
            public WalsDataset apply(WalsDataset dataset) {
                double nLangs = dataset.allLanguages().size();
                Counter<Site> siteCounter = dataset.getSiteCounts();
                Map<Pair<Taxon, Site>, BioCharacter> newData = new HashMap<Pair<Taxon, Site>, BioCharacter>();
                for (Map.Entry<Pair<Taxon, Site>, BioCharacter> entry : dataset.data.entrySet()) {
                    if (siteCounter.getCount(entry.getKey().getSecond()) / nLangs > sitesFractionThreshold) {
                        newData.put(entry.getKey(), entry.getValue());
                    }
                }
                return new WalsDataset(newData);
            }
        };


        /**
         * Applies the operation.
         *
         * @param var1 the dataset to process (not modified)
         * @return a new dataset holding the result
         */
        public abstract WalsDataset apply(WalsDataset var1);

        /**
         * Short description of the operation for log messages; the
         * parameterised operations append the value of their option in
         * parentheses.
         */
        public abstract String summary();
    }

    /**
     * Reads a WALS data directory ({@code languages.tab},
     * {@code features.tab}, {@code values.tab}, {@code datapoints.tab}) into
     * the (language, site) -> character map backing a {@link WalsDataset}.
     */
    private static class Parser {
        // Feature id -> site.
        private final Map<Integer, Site> sites = new HashMap<Integer, Site>();
        // Site -> (value id -> character).
        private final Map<Site, Map<Integer, BioCharacter>> chars = new HashMap<Site, Map<Integer, BioCharacter>>();
        private final Map<Pair<Taxon, Site>, BioCharacter> data = new HashMap<Pair<Taxon, Site>, BioCharacter>();

        /**
         * Loads the language database (if not loaded yet) and then parses the
         * features, the values and the data points, in that order.
         *
         * @param file the WALS data directory
         */
        public Parser(File file) {
            WalsDataset.loadLanguageDatabase(file);
            this.parseSites(new File(file, "features.tab"));
            this.parseBioCharacters(new File(file, "values.tab"));
            this.parseDataPoints(new File(file, "datapoints.tab"));
        }

        /** Wraps the parsed data points in a new dataset. */
        public WalsDataset getDataset() {
            return new WalsDataset(this.data);
        }

        /**
         * Parses the data-point table. The line starting with
         * {@code wals_code} is the header: its columns 1.. hold feature ids.
         * Every other line holds a WALS language code in column 0 and, in
         * column i, the value id for the feature of header column i (an empty
         * cell means "no data").
         *
         * @throws RuntimeException if a non-empty cell is met before any
         *         header line, or if a cell refers to a value id that was not
         *         declared for its feature
         */
        private void parseDataPoints(File file) {
            ArrayList<Integer> featureCodes = null;
            for (String line : IO.i(file)) {
                String[] fields = line.split("\\t");
                if (line.matches("^wals_code.*")) {
                    featureCodes = new ArrayList<Integer>();
                    // Placeholder for column 0 so that list indices line up with column indices.
                    featureCodes.add(null);
                    for (int i = 1; i < fields.length; ++i) {
                        featureCodes.add(Integer.parseInt(fields[i]));
                    }
                    continue;
                }
                Taxon lang = langDB.walsCode2Language(fields[0]);
                for (int i = 1; i < fields.length; ++i) {
                    if (fields[i].equals("")) continue;
                    if (featureCodes == null) {
                        throw new RuntimeException("Data row found before the wals_code header line in " + file);
                    }
                    int siteId = featureCodes.get(i);
                    Site site = this.sites.get(siteId);
                    int bcId = Integer.parseInt(fields[i]);
                    Map<Integer, BioCharacter> cMap = this.chars.get(site);
                    BioCharacter bc = cMap == null ? null : cMap.get(bcId);
                    if (bc == null) {
                        throw new RuntimeException("Unknown value id " + bcId + " for feature " + siteId + " (" + site + ") in " + file);
                    }
                    this.data.put(Pair.makePair(lang, site), bc);
                }
            }
        }

        /**
         * Parses the feature table: column 0 is the feature id, column 1 the
         * site name. Lines starting with {@code id} are skipped.
         */
        private void parseSites(File file) {
            for (String line : IO.i(file)) {
                if (line.matches("^id.*")) continue;
                String[] fields = line.split("\\t");
                int site = Integer.parseInt(fields[0]);
                this.sites.put(site, new Site(fields[1]));
            }
        }

        /**
         * Parses the value table: column 0 is the feature id, column 1 the
         * value id, column 2 the character name. Lines starting with
         * {@code feature_id} are skipped. Each character is created with
         * index {@code value id - 1}.
         */
        private void parseBioCharacters(File file) {
            for (String line : IO.i(file)) {
                if (line.matches("^feature_id.*")) continue;
                String[] fields = line.split("\\t");
                int _site = Integer.parseInt(fields[0]);
                int bc = Integer.parseInt(fields[1]);
                Site site = this.sites.get(_site);
                Map<Integer, BioCharacter> current = this.chars.get(site);
                if (current == null) {
                    current = new HashMap<Integer, BioCharacter>();
                    this.chars.put(site, current);
                }
                current.put(bc, new BioCharacter(fields[2], bc - 1));
            }
        }
    }

    /**
     * Command-line application: preprocesses the corpus with the SCRIPT and
     * UNDERDOCUMENTED_LANGS steps, prints it with {@code toPhylip()}, then
     * prints the languages grouped by genus and, after a {@code ---}
     * separator, grouped by family.
     */
    public static class WalsDatasetApp
    implements Runnable {
        @Override
        public void run() {
            // Overrides whatever preprocessing steps were configured.
            preprocessingSteps = new ArrayList<WalsCorpusOperation>(Arrays.asList(WalsCorpusOperation.SCRIPT, WalsCorpusOperation.UNDERDOCUMENTED_LANGS));
            WalsDataset corpus = WalsDataset.getPreprocessedCorpus();
            IO.so(corpus.toPhylip());
            MapLeaves leaves = MapLeaves.parse("data/world-language-gene-map.txt");
            // The result is unused; the calls are retained so that behavior is unchanged.
            leaves.getLanguageGeneMap().keySet();
            WalsDatasetApp.printGroups(langDB.genusMap());
            System.out.println("---");
            WalsDatasetApp.printGroups(langDB.familyMap());
        }

        /** Prints one {@code label:<TAB>languages} line per distinct label. */
        private static void printGroups(Map<Taxon, String> labelOf) {
            HashMap<Taxon, String> labels = new HashMap<Taxon, String>(labelOf);
            Map<String, Set<Taxon>> groups = CollUtils.invert(labels);
            for (Map.Entry<String, Set<Taxon>> group : groups.entrySet()) {
                System.out.println("" + group.getKey() + ":\t" + group.getValue());
            }
        }
    }

    /**
     * Language metadata read from a tab-separated language file: genus and
     * family of every language, and the mapping from WALS codes to
     * languages.
     */
    public static class LanguageDatabase {
        private final Map<Taxon, String> genus = new HashMap<Taxon, String>();
        private final Map<Taxon, String> family = new HashMap<Taxon, String>();
        private final Map<String, Taxon> walsCode2Language = new HashMap<String, Taxon>();

        /**
         * Reads the language file. Lines starting with {@code wals} are
         * skipped; on every other line column 0 is the WALS code, column 1
         * the language name, column 4 the genus and column 5 the family.
         * The Taxon is named after the WALS code when
         * {@code useWalsCodeForLanguages} is set, after the cleaned language
         * name otherwise.
         */
        private LanguageDatabase(File languageFile) {
            for (String row : IO.i(languageFile)) {
                if (row.matches("^wals.*")) {
                    continue;
                }
                String[] columns = row.split("\\t");
                String code = columns[0];
                Taxon language = new Taxon(useWalsCodeForLanguages ? code : WalsAnn.cleanedLangName(columns[1], false));
                this.genus.put(language, columns[4]);
                this.family.put(language, columns[5]);
                this.walsCode2Language.put(code, language);
            }
        }

        /** Returns the language registered under the given WALS code, or null if there is none. */
        public Taxon walsCode2Language(String wals) {
            return this.walsCode2Language.get(wals);
        }

        /** Read-only view of the language -> genus map. */
        public Map<Taxon, String> genusMap() {
            return Collections.unmodifiableMap(this.genus);
        }

        /** Read-only view of the language -> family map. */
        public Map<Taxon, String> familyMap() {
            return Collections.unmodifiableMap(this.family);
        }

        /**
         * Reverse lookup by linear scan: returns a WALS code mapped to
         * {@code lang}, or null when no code maps to it.
         */
        public String findWalsCode(Taxon lang) {
            for (Map.Entry<String, Taxon> entry : this.walsCode2Language.entrySet()) {
                if (entry.getValue().equals(lang)) {
                    return entry.getKey();
                }
            }
            return null;
        }
    }
}

