/*
 * Decompiled with CFR 0.152.
 */
package pty.io;

import fig.basic.IOUtils;
import fig.basic.LogInfo;
import fig.basic.Option;
import goblin.Taxon;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import nuts.io.IO;
import nuts.lang.StringUtils;
import nuts.util.Counter;
import pty.io.Dataset;
import pty.io.WalsAnn;

/**
 * {@link Dataset} of per-taxon frequency observations read from the PHYLIP-style text
 * file named by {@link #path}.
 *
 * <p>Every accepted content line yields one {@link Taxon} whose observation is an
 * {@code nSites x 2} array: column 0 holds the value parsed from the file and column 1
 * holds {@code 1 - value}. The population name found at the start of the line is also
 * recorded as the taxon's reference cluster.
 *
 * <p>All configuration is held in public static {@code @Option} fields, and the file is
 * read inside the constructor.
 */
public class HGDPDataset
implements Dataset {
    /** File read by the constructor. */
    @Option
    public static String path = "data/hgdp/hgdp.ie.phylip";
    /** Maximum number of consecutive lines kept for one population name. */
    @Option
    public static int maxIndividualPerPopulation = 1;
    /** When non-empty, only populations whose name appears in this list are kept. */
    @Option
    public static ArrayList<String> restrictToPopulations = new ArrayList<String>();
    /** When true, taxon names are passed through {@code WalsAnn.cleanForPhylip} and stripped of spaces. */
    @Option
    public static boolean usePhylipNameTruncations = false;
    /** Upper bound on the number of columns (sites) read from each line. */
    @Option
    public static int maxNSites = Integer.MAX_VALUE;
    /** When true, the constructor keeps only a subset of sites (see {@link #fractionToFilterOut}). */
    @Option
    public static boolean filterSNPs = false;
    /** Fraction of the sites discarded when {@link #filterSNPs} is enabled. */
    @Option
    public static double fractionToFilterOut = 0.9;
    /** Observations per taxon, indexed {@code [site][character]}. */
    Map<Taxon, double[][]> obs = new HashMap<Taxon, double[][]>();
    /** Population name recorded for each kept taxon. */
    Map<Taxon, String> clu = new HashMap<Taxon, String>();
    /** Population sizes filled in by {@link #readPopulationSizes(String)}. */
    Map<Taxon, Integer> popsizes = new HashMap<Taxon, Integer>();
    /** Number of characters per site: the parsed value and its complement. */
    private final int nChars = 2;
    /** Number of sites per taxon; -1 until the first content line has been read. */
    private int nSites = -1;
    /** Captures the leading non-whitespace token of a line (the population name). */
    public static final Pattern POPULATION_PATTERN = Pattern.compile("^([^\\s]*)\\s+.*");
    /** Captures everything after the name, starting at the first digit. */
    public static final Pattern SITE_PATTERN = Pattern.compile("^[^\\s]*\\s+([0-9].*)$");
    /** Matches a content line: a name, whitespace, then data beginning with a digit. */
    public static final Pattern PHYLIP_CONTENT_LINE_PATTERN = Pattern.compile("^[^\\s]*\\s+[0-9].*$");
    /** Matches a header line made only of whitespace-separated integers (or an empty line). */
    public static final Pattern HEADERLINE = Pattern.compile("^([0-9]+\\s*)*$");

    /**
     * Loads the dataset from {@link #path}.
     *
     * <p>Header lines and lines that do not look like content are skipped. The counter of
     * individuals restarts whenever the population name differs from the previous line's,
     * so {@link #maxIndividualPerPopulation} applies to runs of consecutive lines.
     *
     * @throws RuntimeException if two content lines yield a different number of sites
     */
    public HGDPDataset() {
        int cIndInPop = 0;
        String prevPop = null;
        Set<String> allPops = new HashSet<String>();
        for (String line : IO.i(path)) {
            if (HEADERLINE.matcher(line).matches() || !PHYLIP_CONTENT_LINE_PATTERN.matcher(line).matches()) {
                continue;
            }
            String pop = StringUtils.selectFirstRegex(POPULATION_PATTERN, line);
            allPops.add(pop);
            if (!pop.equals(prevPop)) {
                cIndInPop = 0;
            }
            String codeline = StringUtils.selectFirstRegex(SITE_PATTERN, line);
            String[] codes = codeline.split("\\s+");
            int curNSites = Math.min(codes.length, maxNSites);
            if (this.nSites == -1) {
                this.nSites = curNSites;
            } else if (this.nSites != curNSites) {
                throw new RuntimeException("Inconsistent number of sites for " + pop
                        + ": expected " + this.nSites + " but found " + curNSites);
            }
            double[][] codeArray = this.parseFrequencies(pop, codes, curNSites);
            Taxon lang = this.language(pop, cIndInPop);
            if (cIndInPop + 1 <= maxIndividualPerPopulation && this.popIncluded(pop)) {
                this.obs.put(lang, codeArray);
                this.clu.put(lang, pop);
            }
            ++cIndInPop;
            prevPop = pop;
        }
        this.checkPops(allPops);
        if (filterSNPs) {
            this.obs = this.filterSNPs(this.obs);
        }
    }

    /**
     * Converts the first {@code count} tokens of a line into {@code {value, 1 - value}} pairs.
     *
     * <p>An unparsable token is reported on standard output and its row is left at zero;
     * loading continues with the next token.
     *
     * @param pop population name, used only in the diagnostic message
     * @param codes whitespace-separated tokens of the data part of the line
     * @param count number of leading tokens to convert; must not exceed {@code codes.length}
     * @return a {@code count x nChars} array
     */
    private double[][] parseFrequencies(String pop, String[] codes, int count) {
        double[][] parsed = new double[count][this.nChars];
        for (int i = 0; i < count; ++i) {
            try {
                double value = Double.parseDouble(codes[i]);
                parsed[i][0] = value;
                parsed[i][1] = 1.0 - value;
            }
            catch (NumberFormatException e) {
                // The two preceding tokens are shown as context; they do not exist for the
                // first two columns, so guard the indices instead of failing in the handler.
                String secondPrevious = i >= 2 ? codes[i - 2] : "";
                String previous = i >= 1 ? codes[i - 1] : "";
                System.out.println("Code = " + pop + "," + i + "," + codes[i] + "Prefix = " + secondPrevious + "\t" + previous);
            }
        }
        return parsed;
    }

    /**
     * Builds a copy of {@code obs} restricted to a subset of the sites and updates the
     * site count accordingly.
     *
     * <p>Population sizes are (re)read from {@code data/hgdp/popSizes.txt}, every site is
     * scored with {@link #anovaTestStatistics(int)}, and the first
     * {@code (1 - fractionToFilterOut) * nSites} sites returned by the counter's iterator
     * are kept.
     * NOTE(review): this presumes {@code Counter} iterates keys by decreasing count - confirm.
     *
     * @param obs observations to filter; the rows are shared with the result, not copied
     * @return a new map with the same taxa and only the selected sites
     * @throws RuntimeException if the number of sites to keep is below 1 or not smaller
     *         than the current number of sites
     */
    private Map<Taxon, double[][]> filterSNPs(Map<Taxon, double[][]> obs) {
        this.readPopulationSizes("data/hgdp/popSizes.txt");
        int totalSites = this.nSites();
        Counter<Integer> c = new Counter<Integer>();
        for (int s = 0; s < totalSites; ++s) {
            c.setCount(s, this.anovaTestStatistics(s));
        }
        int requiredSites = (int)((1.0 - fractionToFilterOut) * (double)totalSites);
        if (requiredSites < 1 || requiredSites >= totalSites) {
            throw new RuntimeException("Cannot keep " + requiredSites + " of " + totalSites
                    + " sites (fractionToFilterOut = " + fractionToFilterOut + ")");
        }
        ArrayList<Integer> indicesToKeep = new ArrayList<Integer>(requiredSites);
        for (Integer s : c) {
            indicesToKeep.add(s);
            if (indicesToKeep.size() >= requiredSites) {
                break;
            }
        }
        Map<Taxon, double[][]> newObs = new HashMap<Taxon, double[][]>();
        for (Map.Entry<Taxon, double[][]> entry : obs.entrySet()) {
            double[][] model = entry.getValue();
            double[][] cur = new double[requiredSites][];
            for (int i = 0; i < requiredSites; ++i) {
                cur[i] = model[indicesToKeep.get(i)];
            }
            newObs.put(entry.getKey(), cur);
        }
        this.nSites = requiredSites;
        return newObs;
    }

    /**
     * Creates the taxon for one line of the file.
     *
     * @param pop population name read from the line
     * @param index zero-based position of the line within its run of equal population names
     * @return a taxon named {@code pop}, or {@code pop-index} when more than one individual
     *         per population may be kept
     */
    private Taxon language(String pop, int index) {
        if (usePhylipNameTruncations) {
            pop = WalsAnn.cleanForPhylip(pop).replaceAll(" ", "");
        }
        return new Taxon(maxIndividualPerPopulation == 1 ? pop : pop + "-" + index);
    }

    /**
     * Logs a warning for every entry of {@link #restrictToPopulations} that never occurred
     * in the file.
     *
     * @param allPops population names seen while reading
     */
    private void checkPops(Set<String> allPops) {
        for (String pop : restrictToPopulations) {
            if (!allPops.contains(pop)) {
                LogInfo.warning("Population not recognized:" + pop);
            }
        }
    }

    /**
     * @param lang population name
     * @return true when no restriction is configured or the name is in
     *         {@link #restrictToPopulations}
     */
    private boolean popIncluded(String lang) {
        return restrictToPopulations.isEmpty() || restrictToPopulations.contains(lang);
    }

    /**
     * Returns the size stored for {@code lang}, failing with a descriptive message rather
     * than an anonymous unboxing failure when none has been loaded.
     *
     * @param lang taxon to look up
     * @return the size read by {@link #readPopulationSizes(String)}
     * @throws RuntimeException if no size is known for the taxon
     */
    private int populationSize(Taxon lang) {
        Integer size = this.popsizes.get(lang);
        if (size == null) {
            throw new RuntimeException("No population size known for " + lang
                    + "; was readPopulationSizes called with a file that lists it?");
        }
        return size;
    }

    /** @return the population name recorded for each kept taxon */
    @Override
    public Map<Taxon, String> getReferenceClusters() {
        return this.clu;
    }

    /** @return always true */
    @Override
    public boolean hasReferenceClusters() {
        return true;
    }

    /** @return the live map of observations, indexed {@code [site][character]} per taxon */
    @Override
    public Map<Taxon, double[][]> observations() {
        return this.obs;
    }

    /**
     * @param site ignored; every site has the same number of characters
     * @return 2
     */
    @Override
    public int nCharacter(int site) {
        return this.nChars;
    }

    /** @return the number of sites per taxon, or -1 if no content line was read */
    @Override
    public int nSites() {
        return this.nSites;
    }

    /**
     * Reads population sizes from a text file with one space-separated
     * {@code name size} pair per line and stores them keyed by {@code new Taxon(name)}.
     *
     * <p>Blank lines are skipped. Any failure is reported with a stack trace and stops the
     * reading; entries stored before the failure are kept. The reader is always closed.
     *
     * @param file path of the file to read
     */
    public void readPopulationSizes(String file) {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            String s;
            while ((s = br.readLine()) != null) {
                StringTokenizer tok = new StringTokenizer(s, " ");
                if (!tok.hasMoreTokens()) {
                    // A blank line carries no entry; do not let it abort the whole file.
                    continue;
                }
                this.popsizes.put(new Taxon(tok.nextToken()), Integer.parseInt(tok.nextToken()));
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        finally {
            if (br != null) {
                try {
                    br.close();
                }
                catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Prints, for every ordered pair of distinct taxa, the mean squared difference and the
     * FST value, each under its own heading on standard output.
     *
     * @throws RuntimeException if a taxon has no known population size
     */
    public void computeAllPairDistances() {
        Taxon[] keys = new Taxon[this.obs.size()];
        this.obs.keySet().toArray(keys);
        Counter<String> counter1 = new Counter<String>();
        Counter<String> counter2 = new Counter<String>();
        for (int i = 0; i < keys.length; ++i) {
            for (int j = 0; j < keys.length; ++j) {
                if (i == j) {
                    continue;
                }
                int n1 = this.populationSize(keys[i]);
                int n2 = this.populationSize(keys[j]);
                double[][] first = this.obs.get(keys[i]);
                double[][] second = this.obs.get(keys[j]);
                String label = keys[i] + "," + keys[j] + "=";
                counter1.setCount(label, this.computePairwiseDistance(first, second));
                counter2.setCount(label, this.computeFST(first, second, n1, n2));
            }
        }
        System.out.println("Squared distance");
        for (String s : counter1) {
            System.out.println(s + counter1.getCount(s));
        }
        System.out.println("FST");
        for (String s : counter2) {
            System.out.println(s + counter2.getCount(s));
        }
    }

    /**
     * Scores one site by the size-weighted squared deviation of each taxon's value from
     * the pooled value, divided by the total size and by the pooled value.
     *
     * @param site zero-based site index
     * @return the score, or 0 when the pooled value is exactly 0
     * @throws RuntimeException if a taxon has no known population size
     */
    public double anovaTestStatistics(int site) {
        double acrossPopulationsFreq = this.acrossPopulationsFreq(site);
        if (acrossPopulationsFreq == 0.0) {
            return 0.0;
        }
        double numerator = 0.0;
        double totalPopSize = 0.0;
        for (Map.Entry<Taxon, double[][]> entry : this.obs.entrySet()) {
            double currentPopSize = this.populationSize(entry.getKey());
            double deviation = entry.getValue()[site][0] - acrossPopulationsFreq;
            totalPopSize += currentPopSize;
            numerator += currentPopSize * deviation * deviation;
        }
        return numerator / totalPopSize / acrossPopulationsFreq;
    }

    /**
     * Computes the size-weighted mean of column 0 over all taxa at one site.
     *
     * @param site zero-based site index
     * @return the weighted mean; NaN when there are no observations
     * @throws RuntimeException if a taxon has no known population size
     */
    private double acrossPopulationsFreq(int site) {
        double num = 0.0;
        double denom = 0.0;
        for (Map.Entry<Taxon, double[][]> entry : this.obs.entrySet()) {
            double f = entry.getValue()[site][0];
            double n = this.populationSize(entry.getKey());
            num += f * n;
            denom += n;
        }
        return num / denom;
    }

    /**
     * Computes the mean over sites of the squared difference between column 0 of the two
     * observation arrays.
     *
     * @param obs1 first observation array; its length sets the number of sites compared
     * @param obs2 second observation array; must have at least as many rows as {@code obs1}
     * @return the mean squared difference; NaN when {@code obs1} is empty
     */
    public double computePairwiseDistance(double[][] obs1, double[][] obs2) {
        double distance = 0.0;
        for (int i = 0; i < obs1.length; ++i) {
            distance += Math.pow(obs1[i][0] - obs2[i][0], 2.0);
        }
        return distance / (double)obs1.length;
    }

    /**
     * Computes {@code 1 - (f1 + f2) / f}, where {@code f1} and {@code f2} are the
     * within-population sums of {@code p(1 - p)} over sites, each scaled by
     * {@code 2n/(n - 1)} and by its share of within-population pairs, and {@code f} is the
     * same sum for the size-weighted pooled value scaled by {@code 2n/(n - 1)} with
     * {@code n = n1 + n2}.
     *
     * <p>The result is not finite for degenerate sizes such as {@code n1 == 1}.
     *
     * @param obs1 observations of the first population; its length sets the number of sites
     * @param obs2 observations of the second population
     * @param n1 size of the first population
     * @param n2 size of the second population
     * @return the FST value
     */
    public double computeFST(double[][] obs1, double[][] obs2, int n1, int n2) {
        int n = n1 + n2;
        // Pair counts are computed in long so that large sizes cannot overflow int.
        long m1 = (long)n1 * (long)(n1 - 1) / 2L;
        long m2 = (long)n2 * (long)(n2 - 1) / 2L;
        long norm = m1 + m2;
        double f1 = 0.0;
        double f2 = 0.0;
        double f = 0.0;
        for (int i = 0; i < obs1.length; ++i) {
            double p1 = obs1[i][0];
            double p2 = obs2[i][0];
            f1 += p1 * (1.0 - p1);
            f2 += p2 * (1.0 - p2);
            double p = ((double)n1 * p1 + (double)n2 * p2) / (double)n;
            f += p * (1.0 - p);
        }
        f1 = f1 * (2.0 * (double)n1 / (double)(n1 - 1)) * ((double)m1 / (double)norm);
        f2 = f2 * (2.0 * (double)n2 / (double)(n2 - 1)) * ((double)m2 / (double)norm);
        f = f * (2.0 * (double)n / (double)(n - 1));
        return 1.0 - (f1 + f2) / f;
    }

    /**
     * Loads the dataset, reads sizes from {@code data/hgdp/popsizes.IE.txt} and prints all
     * pairwise distances.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HGDPDataset hgdp = new HGDPDataset();
        hgdp.readPopulationSizes("data/hgdp/popsizes.IE.txt");
        hgdp.computeAllPairDistances();
    }

    /** Entry point that rewrites the loaded observations to {@code data/hgdp/forcontml.phylip}. */
    public static class PrintPruned {
        /**
         * Loads the sampled frequency file and writes a count header, one {@code 2} per
         * site, then one line per taxon holding column 0 of every site.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            path = "data/hgdp/frequency.pops-all.chr-all.snps-sampled.txt";
            HGDPDataset hgdp = new HGDPDataset();
            Map<Taxon, double[][]> data = hgdp.observations();
            PrintWriter out = IOUtils.openOutHard("data/hgdp/forcontml.phylip");
            try {
                out.println("   " + data.size() + " " + hgdp.nSites());
                for (int i = 0; i < hgdp.nSites(); ++i) {
                    out.print("2 ");
                }
                out.print("\n");
                for (Map.Entry<Taxon, double[][]> entry : data.entrySet()) {
                    out.print(WalsAnn.cleanForPhylip(entry.getKey().toString()) + " ");
                    double[][] datum = entry.getValue();
                    for (int s = 0; s < datum.length; ++s) {
                        out.print("" + datum[s][0] + " ");
                    }
                    out.print("\n");
                }
            }
            finally {
                // Close the writer even if writing fails part-way through.
                out.close();
            }
        }
    }

    /** Entry point that prints each site's score next to its pooled value. */
    public static class PrintSiteFsts {
        /**
         * Loads the sampled frequency file and {@code data/hgdp/popSizes.txt}, scores every
         * site with {@link HGDPDataset#anovaTestStatistics(int)} and prints
         * {@code score<TAB>pooled value} in the counter's iteration order.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            path = "data/hgdp/frequency.pops-all.chr-all.snps-sampled.txt";
            HGDPDataset hgdp = new HGDPDataset();
            hgdp.readPopulationSizes("data/hgdp/popSizes.txt");
            Counter<Integer> c = new Counter<Integer>();
            for (int s = 0; s < hgdp.nSites(); ++s) {
                c.setCount(s, hgdp.anovaTestStatistics(s));
            }
            for (Integer s : c) {
                System.out.println(c.getCount(s) + "\t" + hgdp.acrossPopulationsFreq(s));
            }
        }
    }
}

