/*
 * Decompiled with CFR 0.152.
 */
package ev.io;

import ev.io.RandomOverlappingDataset;
import ev.io.SpeciesInfo;
import fig.basic.IOUtils;
import fig.basic.LogInfo;
import fig.basic.Option;
import fig.exec.Execution;
import goblin.Taxon;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.regex.Pattern;
import ma.MSAPoset;
import ma.MultiAlignment;
import nuts.io.IO;
import nuts.lang.StringUtils;
import nuts.math.Sampling;
import nuts.util.CollUtils;
import nuts.util.Counter;
import nuts.util.Indexer;
import pepper.Encodings;
import pty.RootedTree;

public class PreprocessGutellData {
    /**
     * When true (and {@code keepOnlyCaps} is requested), {@code randomDataSet} post-processes each
     * replica with {@code MSAPoset._processBenchmarkReference} instead of
     * {@code MSAPoset.processBenchmarkReference}.
     */
    public static boolean _processBenchDebugSwitch = false;
    /**
     * Line pattern used by {@code parse}: group 1 is the text before a space-delimited run of
     * digits (must not start with a space), group 2 is everything after it.
     */
    public static Pattern matcherPattern = Pattern.compile("([^ ].*) [0-9]+ (.*)");
    /** A literal backslash followed by exactly three digits; replaced by "-" in {@code parse}. */
    public static Pattern sub1 = Pattern.compile("\\\\[0-9][0-9][0-9]");
    /** Two consecutive forward slashes; replaced by a single "-" in {@code parse}. */
    public static Pattern sub2 = Pattern.compile("//");
    /** Any character other than A, C, G, U or '-'; replaced by "-" in {@code parse} after upper-casing. */
    public static Pattern sub3 = Pattern.compile("[^ACGU-]");

    /**
     * Samples random alignments and writes one MSF file per replica into {@code outputDirectory}.
     *
     * <p>For each replica returned by {@code randomDataSet}, the alignment is optionally chunked
     * with {@code RandomOverlappingDataset.chunk}, optionally merged with an outlier alignment
     * sampled from {@code outlierDirectory}, and then saved under the name produced by
     * {@code IO.appendSuffix("gutell-" + i, "msf")}.
     *
     * @param outputDirectory  directory receiving the MSF files; created (with parents) if missing
     * @param inputDirectory   passed as the input file to {@code randomDataSet} (despite the name,
     *                         it is read as a file there)
     * @param outlierDirectory input for the outlier sample; only read when {@code nOutliers > 0}
     * @param nReplica         number of replicas requested from {@code randomDataSet}
     * @param nTaxaPerReplica  number of taxa sampled per replica
     * @param nOutliers        number of outlier taxa per replica; {@code <= 0} disables outliers
     * @param rand             random source shared by both sampling passes
     * @param maxLen           chunk size handed to {@code RandomOverlappingDataset.chunk};
     *                         {@code Integer.MAX_VALUE} disables chunking
     */
    public static void createRandomGutellSplit(File outputDirectory, File inputDirectory, File outlierDirectory, int nReplica, int nTaxaPerReplica, int nOutliers, Random rand, int maxLen) {
        // mkdirs() instead of mkdir(): mkdir() returns false without creating anything when a
        // parent directory is missing, which would only surface later as a failed write.
        outputDirectory.mkdirs();
        final boolean chunking = maxLen != Integer.MAX_VALUE;
        // The main sample is drawn before the outlier sample so the random stream is consumed
        // in the same order as before.
        List<MSAPoset> msas = PreprocessGutellData.randomDataSet(inputDirectory, nReplica, nTaxaPerReplica, rand);
        List<MSAPoset> outlierMsas = null;
        if (nOutliers > 0) {
            outlierMsas = PreprocessGutellData.randomDataSet(outlierDirectory, nReplica, nOutliers, rand);
        }
        for (int i = 0; i < msas.size(); ++i) {
            MSAPoset current = msas.get(i);
            if (chunking) {
                current = RandomOverlappingDataset.chunk(current, maxLen);
            }
            if (outlierMsas != null) {
                MSAPoset tojoin = outlierMsas.get(i);
                if (chunking) {
                    tojoin = RandomOverlappingDataset.chunk(tojoin, maxLen);
                }
                current = RandomOverlappingDataset.merge(tojoin, current);
            }
            File target = new File(outputDirectory, IO.appendSuffix("gutell-" + i, "msf"));
            current.toMultiAlignmentObject().saveToMSF(target);
        }
    }

    /**
     * Convenience overload: samples replicas without a persistent output directory.
     *
     * <p>Delegates to the five-argument overload with a {@code null} output directory.
     *
     * @param inputFile       alignment file to sample from
     * @param nReplica        number of replicas to build
     * @param nTaxaPerReplica number of taxa sampled per replica
     * @param rand            random source
     * @return the sampled alignments, as returned by the delegate
     */
    public static List<MSAPoset> randomDataSet(File inputFile, int nReplica, int nTaxaPerReplica, Random rand) {
        final File noOutputDir = null;
        return PreprocessGutellData.randomDataSet(inputFile, noOutputDir, nReplica, nTaxaPerReplica, rand);
    }

    /**
     * Convenience overload: same as the six-argument overload with {@code keepOnlyCaps} set to
     * {@code true}.
     *
     * @param inputFile       alignment file to sample from
     * @param outputDir       directory for the per-replica FASTA files, or {@code null}
     * @param nReplica        number of replicas to build
     * @param nTaxaPerReplica number of taxa sampled per replica
     * @param rand            random source
     * @return the sampled alignments, as returned by the delegate
     */
    public static List<MSAPoset> randomDataSet(File inputFile, File outputDir, int nReplica, int nTaxaPerReplica, Random rand) {
        final boolean keepOnlyCaps = true;
        return PreprocessGutellData.randomDataSet(inputFile, outputDir, nReplica, nTaxaPerReplica, rand, keepOnlyCaps);
    }

    /**
     * Builds {@code nReplica} alignments, each from a random sample (without replacement) of the
     * sequence records found in {@code inputFile}.
     *
     * <p>For every replica the sampled records are written to a FASTA file, which is then read
     * back with {@code MSAPoset.parseFASTA}. With {@code outputDir == null} a temporary file
     * (deleted on JVM exit) is used; otherwise the file is {@code <outputDir>/<replica>.fasta}.
     *
     * @param inputFile       FASTA-like file to read records from
     * @param outputDir       directory for the per-replica FASTA files, or {@code null} for
     *                        temporary files
     * @param nReplica        number of replicas to build
     * @param nTaxaPerReplica number of records sampled per replica; capped at the number of
     *                        records available
     * @param rand            random source for the sampling
     * @param keepOnlyCaps    when true, each parsed alignment is passed through
     *                        {@code MSAPoset.processBenchmarkReference} (or the underscore-prefixed
     *                        variant when {@code _processBenchDebugSwitch} is set)
     * @return one alignment per replica, in replica order
     * @throws RuntimeException if the temporary file cannot be created (the {@code IOException}
     *                          is attached as the cause)
     */
    public static List<MSAPoset> randomDataSet(File inputFile, File outputDir, int nReplica, int nTaxaPerReplica, Random rand, boolean keepOnlyCaps) {
        List<String> blocks = readFastaRecords(inputFile);
        int sampleSize = Math.min(nTaxaPerReplica, blocks.size());
        ArrayList<MSAPoset> result = new ArrayList<MSAPoset>();
        for (int cr = 0; cr < nReplica; ++cr) {
            List<Integer> indices = Sampling.sampleWithoutReplacement(rand, blocks.size(), sampleSize);
            File outFile;
            try {
                if (outputDir == null) {
                    outFile = File.createTempFile("align" + System.currentTimeMillis(), null);
                    outFile.deleteOnExit();
                } else {
                    outFile = new File(outputDir, "" + cr + ".fasta");
                }
            }
            catch (IOException e) {
                // Keep the cause so the underlying I/O problem is visible in the stack trace.
                throw new RuntimeException("Could not create temporary FASTA file for replica " + cr, e);
            }
            PrintWriter out = IOUtils.openOutHard(outFile);
            try {
                for (int index : indices) {
                    out.append(blocks.get(index));
                }
            }
            finally {
                // Always release the file handle, even if a write fails.
                out.close();
            }
            MSAPoset processed = MSAPoset.parseFASTA(outFile);
            if (!keepOnlyCaps) {
                result.add(processed);
            } else if (_processBenchDebugSwitch) {
                result.add(MSAPoset._processBenchmarkReference(processed));
            } else {
                result.add(MSAPoset.processBenchmarkReference(processed));
            }
        }
        return result;
    }

    /**
     * Reads {@code inputFile} and returns one FASTA record (">name\n" followed by its sequence
     * lines) per retained taxon.
     *
     * <p>Blank lines and lines starting with ';' are ignored. A line starting with '>' opens a
     * new record; its taxon name is extracted with the pattern
     * {@code [^:]+[:][:]([^:]+)[:][:].*}. Records whose header contains "DIVIDER" or "synthetic",
     * or whose taxon name was already seen, are dropped together with their sequence lines.
     * Sequence lines are kept only if they consist solely of letters, '.' and '-'.
     */
    private static List<String> readFastaRecords(File inputFile) {
        // Compiled once here instead of once per line via String.matches().
        final Pattern blankLine = Pattern.compile("^\\s*$");
        final Pattern commentLine = Pattern.compile("^[;].*");
        final Pattern headerLine = Pattern.compile("[>].*");
        final Pattern sequenceLine = Pattern.compile("[a-zA-Z.-]*");

        List<String> records = new ArrayList<String>();
        Set<String> seenTaxa = new HashSet<String>();
        StringBuilder current = null;
        for (String line : IO.i(inputFile)) {
            if (blankLine.matcher(line).matches() || commentLine.matcher(line).matches()) continue;
            if (headerLine.matcher(line).matches()) {
                // A new header always terminates the record in progress.
                if (current != null) {
                    records.add(current.toString());
                    current = null;
                }
                String fullTaxonStr = StringUtils.selectFirstRegex("[>](.*)", line);
                String taxonStr = StringUtils.selectFirstRegex("[^:]+[:][:]([^:]+)[:][:].*", fullTaxonStr);
                boolean skipped = fullTaxonStr.contains("DIVIDER") || fullTaxonStr.contains("synthetic") || seenTaxa.contains(taxonStr);
                if (skipped) continue; // current stays null, so the record's sequence lines are dropped too
                seenTaxa.add(taxonStr);
                current = new StringBuilder();
                current.append(">" + taxonStr + "\n");
                continue;
            }
            if (current == null || !sequenceLine.matcher(line).matches()) continue;
            current.append(line + "\n");
        }
        if (current != null) {
            records.add(current.toString());
        }
        return records;
    }

    /**
     * Converts a Gutell-style alignment file into a {@code MultiAlignment}.
     *
     * <p>Empty lines and lines starting with '#' are skipped. Every other line is split with
     * {@code matcherPattern} into a name and a sequence. The name is reduced to its ASCII
     * alphanumeric characters via {@link #clean(String)}. In the sequence, matches of
     * {@code sub1} and {@code sub2} become "-", the text is upper-cased, and then every match of
     * {@code sub3} becomes "-". The resulting "name sequence" lines are handed to
     * {@code MultiAlignment.parseALNStringToMultiAlignment}.
     *
     * @param gutellFile                 path of the file to read
     * @param showCharactersDistribution when true, logs the normalized distribution of the
     *                                   characters in the converted sequences
     * @param genusRestriction           classification passed to
     *                                   {@code SpeciesInfo.speciesInClassif}; {@code null} or
     *                                   empty means no restriction
     * @return the parsed alignment
     * @throws RuntimeException wrapping any {@code IOException} from the final parse, or thrown by
     *                          {@link #clean(String)} for a name with no alphanumeric character
     */
    public static MultiAlignment parse(String gutellFile, boolean showCharactersDistribution, String genusRestriction) {
        StringBuilder convertedContents = new StringBuilder();
        Set<Taxon> restriction = null;
        if (genusRestriction != null && !genusRestriction.equals("")) {
            restriction = SpeciesInfo.speciesInClassif(genusRestriction, SpeciesInfo.getEukClassifications());
        }
        LogInfo.logsForce("Converting");
        Counter<Character> unk = new Counter<Character>();
        // Consecutive lines often share a name; cache the last cleaned name to avoid re-cleaning.
        String previousUncleaned = null;
        String previousCleaned = null;
        for (String line : IO.i(gutellFile)) {
            // Guard against empty lines: charAt(0) would otherwise throw
            // StringIndexOutOfBoundsException (e.g. on a trailing blank line).
            if (line.isEmpty() || line.charAt(0) == '#') continue;
            List<String> matches = StringUtils.multiSelectFirstRegex(matcherPattern, line);
            String curUncleaned = matches.get(0);
            String curCleaned;
            if (curUncleaned.equals(previousUncleaned)) {
                curCleaned = previousCleaned;
            } else {
                curCleaned = PreprocessGutellData.clean(curUncleaned);
                previousUncleaned = curUncleaned;
                previousCleaned = curCleaned;
            }
            if (restriction != null && !restriction.contains(new Taxon(curCleaned))) continue;
            String seq = matches.get(1);
            seq = sub1.matcher(seq).replaceAll("-");
            seq = sub2.matcher(seq).replaceAll("-");
            seq = seq.toUpperCase();
            seq = sub3.matcher(seq).replaceAll("-");
            if (showCharactersDistribution) {
                for (int k = 0; k < seq.length(); ++k) {
                    unk.incrementCount(Character.valueOf(seq.charAt(k)), 1.0);
                }
            }
            convertedContents.append(curCleaned).append(" ").append(seq).append('\n');
        }
        if (showCharactersDistribution) {
            unk.normalize();
            LogInfo.warning("Processed:" + unk);
        }
        LogInfo.logsForce("Creating MSA");
        try {
            return MultiAlignment.parseALNStringToMultiAlignment(convertedContents.toString(), "");
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Strips everything except ASCII letters and digits from {@code s}.
     *
     * @param s the raw name
     * @return {@code s} with only the characters in {@code [A-Za-z0-9]} kept, in order
     * @throws RuntimeException if no character survives; the message is {@code "orig:" + s}
     */
    public static String clean(String s) {
        // Plain range checks and a StringBuilder replace the per-character regex match and
        // repeated string concatenation; the accepted character set is unchanged.
        StringBuilder kept = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); ++i) {
            char c = s.charAt(i);
            boolean asciiAlnum = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
            if (asciiAlnum) {
                kept.append(c);
            }
        }
        if (kept.length() == 0) {
            throw new RuntimeException("orig:" + s);
        }
        return kept.toString();
    }

    /**
     * Ad-hoc driver: samples replicas from a hard-coded 5S alignment file, adds random outlier
     * sequences to each, and writes every result as an MSF file plus a random tree in Newick
     * format. All paths and sizes are fixed in the method body; {@code args} is ignored.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        int nRep = 10;
        int nTax = 5;
        int len = 200;
        int outliers = 3;
        Random rand = new Random(1L);
        File outputDir = new File("data/gutell/processed/5S/alignments-" + nRep + "-" + nTax);
        // mkdirs(): these are multi-level relative paths, and mkdir() creates nothing when a
        // parent directory does not exist yet.
        outputDir.mkdirs();
        // NOTE(review): absolute, machine-specific input path.
        File input = new File("/Users/bouchard/w/evolvere/data/gutell/raw/5S.3.alnfasta");
        List<MSAPoset> msas = PreprocessGutellData.randomDataSet(input, outputDir, nRep, nTax, rand);
        File balistyle = new File("data/gutell/processed/5S/alignments-balistyle-" + nRep + "-" + nTax + "-" + outliers + "-" + len);
        balistyle.mkdirs();
        Indexer<Character> indexer = Encodings.rnaEncodings().nonGapCharactersIndexer();
        int i = 0;
        for (MSAPoset msa : msas) {
            for (int outI = 0; outI < outliers; ++outI) {
                msa = PreprocessGutellData.addOutlier(msa, new Taxon("outlier_" + outI), indexer, rand, len);
            }
            MultiAlignment convert = msa.toMultiAlignmentObject();
            convert.saveToMSF(new File(balistyle, "" + i + ".msf"));
            RootedTree randomTree = RootedTree.Util.random(rand, msa.taxa());
            IO.writeToDisk(new File(balistyle, "" + i + ".newick"), RootedTree.Util.toNewick(randomTree));
            ++i;
            System.out.println(msa);
        }
    }

    /**
     * Returns a new alignment containing all sequences of {@code msa} plus one randomly generated
     * sequence for {@code outliername}, with every column of {@code msa} re-added.
     *
     * @param msa         the alignment to extend; not modified by this method's own statements
     * @param outliername taxon for the new random sequence; must not already be in {@code msa}
     * @param indexer     alphabet the random characters are drawn from
     * @param rand        random source
     * @param length      length of the random sequence
     * @return the extended alignment
     * @throws RuntimeException if {@code outliername} is already present, or if a column of
     *                          {@code msa} cannot be added to the new alignment
     */
    public static MSAPoset addOutlier(MSAPoset msa, Taxon outliername, Indexer<Character> indexer, Random rand, int length) {
        if (msa.sequences().containsKey(outliername)) {
            throw new RuntimeException("Alignment already contains a sequence for taxon " + outliername);
        }
        String randomStr = PreprocessGutellData.randomStr(indexer, rand, length);
        HashMap<Taxon, String> newSeqs = new HashMap<Taxon, String>(msa.sequences());
        newSeqs.put(outliername, randomStr);
        MSAPoset result = new MSAPoset(newSeqs);
        for (MSAPoset.Column c : msa.columns()) {
            if (!result.tryAdding(c)) {
                throw new RuntimeException("Could not re-add column " + c + " after adding outlier " + outliername);
            }
        }
        return result;
    }

    /**
     * Builds a string of {@code length} items, each obtained by drawing an index with
     * {@code rand.nextInt(indexer.size())} and looking it up with {@code indexer.i2o}.
     * A non-positive {@code length} yields the empty string.
     */
    private static String randomStr(Indexer<Character> indexer, Random rand, int length) {
        StringBuilder buffer = new StringBuilder();
        int remaining = length;
        while (remaining-- > 0) {
            int symbolIndex = rand.nextInt(indexer.size());
            buffer.append(indexer.i2o(symbolIndex));
        }
        return buffer.toString();
    }

    /**
     * Option-driven runner: parses {@link #rnaFile}, writes the full alignment as MSF, and then
     * writes either a single chunked "corpus" file or a series of per-block MSF/FASTA/tree files.
     */
    public static class PreprocessGutellDataMain
    implements Runnable {
        /** Path of the alignment file handed to {@code PreprocessGutellData.parse}. */
        @Option(required=true)
        public String rnaFile;
        /** Number of taxa per block in per-block output mode. */
        @Option
        public int nPerBlock = 5;
        /** Upper bound on the number of blocks written in per-block output mode. */
        @Option
        public int nBlocks = 10;
        /** When true, the last taxon of each block is written after the closing parenthesis of its tree. */
        @Option
        public boolean createPseudoRoot = false;
        /** Selects the chunked corpus output (true) or the per-block output (false). */
        @Option
        public boolean useAustoFormat = true;
        /** Number of alignment columns per chunk in the corpus output; must be positive. */
        @Option
        public int maxSeqLen = 10;
        /** A chunk is written only if the fraction of taxa with a non-empty substring exceeds this value. */
        @Option
        public double blanksThreshold = 0.7;
        /** Passed to {@code PreprocessGutellData.parse} as its classification restriction. */
        @Option(gloss="Restricted to Eukaryotes right now")
        public String classifRestriction = "";

        /**
         * Parses the input, writes {@code align.msf}, then writes the output selected by
         * {@link #useAustoFormat}.
         */
        @Override
        public void run() {
            MultiAlignment ma = PreprocessGutellData.parse(this.rnaFile, false, this.classifRestriction);
            // Work on a copy of the key set so removals do not touch the alignment itself.
            ArrayList<Taxon> species = new ArrayList<Taxon>(ma.getSequences().keySet());
            for (Taxon l : ma.getSequences().keySet()) {
                if (ma.getSequences().get(l).length() == 0) {
                    LogInfo.warning("Removing " + l + " (empty sequence)");
                    species.remove(l);
                }
            }
            // Sort first so the fixed-seed shuffle yields the same order regardless of map order.
            Collections.sort(species);
            Collections.shuffle(species, new Random(1L));
            LogInfo.logsForce("Writing full MSA to disk");
            IO.writeToDisk(Execution.getFile("align.msf"), ma.toMSFString());
            if (this.useAustoFormat) {
                LogInfo.logsForce("Writing Corpus format file");
                this.toAusFmt(ma, new File(Execution.getFile("corpus")));
                return;
            }
            File outDir = new File(Execution.getFile("output"));
            outDir.mkdir();
            for (int i = 0; i * this.nPerBlock < species.size() && i < this.nBlocks; ++i) {
                int from = i * this.nPerBlock;
                int to = Math.min(species.size(), (i + 1) * this.nPerBlock);
                HashSet<Taxon> currentSet = new HashSet<Taxon>(species.subList(from, to));
                MultiAlignment restricted = MultiAlignment.restrict(ma, currentSet);
                String stem = "block_" + i;
                IO.writeToDisk(new File(outDir, stem + ".msf"), restricted.toMSFString());
                IO.writeToDisk(new File(outDir, stem + ".fasta"), MultiAlignment.toFASTA(restricted));
                IO.writeToDisk(new File(outDir, stem + ".weitree"), this.flatTree(new ArrayList<Taxon>(currentSet), this.createPseudoRoot));
            }
        }

        /**
         * Writes the alignment as a table: a "#id" header listing all taxa, then one line per
         * window of {@link #maxSeqLen} columns with each taxon's gap-free substring (or
         * "?emptystring" when it has none). Windows in which too few taxa have content (see
         * {@link #blanksThreshold}) are omitted.
         *
         * @throws IllegalArgumentException if {@link #maxSeqLen} is not positive (the window loop
         *                                  would otherwise never advance)
         */
        private void toAusFmt(MultiAlignment ma, File file) {
            if (this.maxSeqLen <= 0) {
                throw new IllegalArgumentException("maxSeqLen must be positive, got " + this.maxSeqLen);
            }
            MultiAlignment.LinearizedAlignmentMatrix lam = ma.createAlignmentMatrix();
            ArrayList<Taxon> allLangs = new ArrayList<Taxon>(ma.getSequences().keySet());
            PrintWriter out = IOUtils.openOutHard(file);
            try {
                out.print("#id ");
                for (Taxon lang : allLangs) {
                    out.print("" + lang + " ");
                }
                out.print("\n");
                int nCols = lam.nCols();
                // NOTE(review): the bound is nCols - 1, so a window that would start on the very
                // last column is never emitted — confirm whether that is intended.
                for (int leftPos = 0; leftPos < nCols - 1; leftPos += this.maxSeqLen) {
                    int rightExcl = Math.min(nCols, leftPos + this.maxSeqLen);
                    StringBuilder current = new StringBuilder();
                    double nonBlanks = 0.0;
                    current.append("chunk_" + leftPos + "-" + rightExcl + " ");
                    for (Taxon lang : allLangs) {
                        String cur = lam.substringWithoutGaps(lang, leftPos, rightExcl);
                        if (cur.length() > 0) {
                            current.append(cur);
                            nonBlanks += 1.0;
                        } else {
                            current.append("?emptystring");
                        }
                        current.append(" ");
                    }
                    current.append("\n");
                    if (nonBlanks / (double)allLangs.size() > this.blanksThreshold) {
                        out.print(current.toString());
                    }
                }
            }
            finally {
                // Release the file handle even if building a chunk throws.
                out.close();
            }
        }

        /**
         * Renders the taxa as a single-level tree string: {@code (a,b,c);}, or
         * {@code (a,b)c;} when {@code createPseudoRoot} is set, in which case the last element
         * is removed from {@code list} and written after the closing parenthesis.
         */
        private String flatTree(List<Taxon> list, boolean createPseudoRoot) {
            Taxon root = null;
            if (createPseudoRoot) {
                root = list.remove(list.size() - 1);
            }
            StringBuilder result = new StringBuilder();
            result.append('(');
            boolean first = true;
            for (Taxon taxon : list) {
                if (!first) {
                    result.append(',');
                }
                result.append(taxon.toString());
                first = false;
            }
            result.append(")");
            if (createPseudoRoot) {
                result.append("" + root);
            }
            result.append(";");
            return result.toString();
        }
    }
}

