/*
 * Decompiled with CFR 0.152.
 */
package ev.ex;

import fig.basic.IOUtils;
import fig.basic.LogInfo;
import fig.basic.Option;
import fig.exec.Execution;
import java.io.File;
import java.io.PrintWriter;
import java.util.List;
import java.util.regex.Pattern;
import nuts.io.IO;

/**
 * Builds a sequence-tagging dataset (amino acid -> secondary structure label)
 * from a comma-separated list of RCSB identifiers.
 *
 * <p>The pipeline, all rooted in the files handed out by {@code Execution.getFile}:
 * <ol>
 *   <li>download one {@code .pdb} file per identifier into {@code pdbFiles} using
 *       the configured wget command;</li>
 *   <li>run the configured DSSP command on every downloaded file, writing the
 *       result into {@code dsspFiles} with a {@code .dssp} suffix;</li>
 *   <li>write a {@code metadata} file plus {@code train} / {@code test} files into
 *       {@code dataset}, using the first 80% of the DSSP files for training.</li>
 * </ol>
 */
public class GenerateSecondaryPredictionDataset
implements Runnable {
    /** Prefix of the download URL; the identifier and ".pdb" are appended to it. */
    private static final String RCSB_URL_PREFIX = "http://www.rcsb.org/pdb/files/";
    /** Matches the column header line that precedes the per-residue rows of a DSSP file. */
    private static final Pattern DSSP_TABLE_HEADER = Pattern.compile(".*[#].*KAPPA.*ALPHA.*PHI.*PSI.*");
    /** Fraction of the DSSP files that goes into the training file. */
    private static final double TRAIN_FRACTION = 0.8;
    /** Zero-based index of the amino-acid character read from each residue row. */
    private static final int AMINO_ACID_COLUMN = 13;
    /** Zero-based index of the secondary-structure character read from each residue row. */
    private static final int STRUCTURE_COLUMN = 16;

    /** Path of the file holding the comma-separated RCSB identifiers. */
    @Option(required=true)
    public String rcsbFile;
    /** Command used to invoke DSSP; called as {@code <cmd> <pdb path> <dssp path>}. */
    @Option
    public String dsspString = "dsspcmbi";
    /** Command used to download a URL; called as {@code <cmd> <url>}. */
    @Option
    public String wgetString = "wget";

    public static void main(String[] args) {
        IO.run(args, new GenerateSecondaryPredictionDataset());
    }

    /**
     * Runs the download, DSSP conversion and dataset generation steps in order.
     *
     * @throws IllegalStateException if one of the working directories cannot be
     *         created or an output file cannot be opened
     */
    @Override
    public void run() {
        File pdbDir = GenerateSecondaryPredictionDataset.ensureDirectory(new File(Execution.getFile("pdbFiles")));
        this.downloadPdbFiles(pdbDir);

        File dsspDir = GenerateSecondaryPredictionDataset.ensureDirectory(new File(Execution.getFile("dsspFiles")));
        this.convertToDssp(pdbDir, dsspDir);

        File output = GenerateSecondaryPredictionDataset.ensureDirectory(new File(Execution.getFile("dataset")));
        IO.writeToDisk(new File(output, "metadata"), "name: protein-secondary-structure\ndescription: Input: Amino acid, Output: Secondary struct as defined by DSSP. Data downloaded from rcsb on epoch " + System.currentTimeMillis() + " with filters: " + "res. > 2.5 A, X-ray, seq.len. > 50\n" + "format: SequenceTagging");

        // NOTE(review): the split follows whatever order IO.ls returns; confirm that
        // order is stable if a reproducible train/test split matters.
        List<File> dsspFilesList = IO.ls(dsspDir, "dssp");
        int nTrain = (int)(TRAIN_FRACTION * (double)dsspFilesList.size());
        this.createMLComp(dsspFilesList.subList(0, nTrain), new File(output, "train"));
        this.createMLComp(dsspFilesList.subList(nTrain, dsspFilesList.size()), new File(output, "test"));
    }

    /** Creates {@code dir} (and missing parents) if needed and fails loudly when that is impossible. */
    private static File ensureDirectory(File dir) {
        // mkdirs() returns false both on failure and when the directory already
        // exists, so the outcome is checked with isDirectory() instead.
        dir.mkdirs();
        if (!dir.isDirectory()) {
            throw new IllegalStateException("Could not create directory " + dir);
        }
        return dir;
    }

    /** Downloads one .pdb file per identifier listed in {@link #rcsbFile} into {@code pdbDir}. */
    private void downloadPdbFiles(File pdbDir) {
        // Trim the whole content and allow whitespace on both sides of each comma so
        // that a trailing newline or "id , id" formatting never leaks into the URL.
        String[] ids = IO.f2s(this.rcsbFile).trim().split("\\s*,\\s*");
        int i = 0;
        LogInfo.track("Downloading data from rcsb");
        try {
            for (String id : ids) {
                ++i;
                if (id.isEmpty()) {
                    // Empty file or doubled comma: nothing to download.
                    continue;
                }
                LogInfo.logs("Downloading " + i + "/" + ids.length);
                // NOTE(review): the command is assembled as a single string, so an id
                // containing whitespace or shell metacharacters would alter it; confirm
                // how IO.call tokenizes before feeding it untrusted identifier lists.
                IO.call(this.wgetString + " " + RCSB_URL_PREFIX + id + ".pdb", null, pdbDir);
            }
        } finally {
            LogInfo.end_track();
        }
    }

    /** Runs the DSSP command on every "pdb" file of {@code pdbDir}, writing into {@code dsspDir}. */
    private void convertToDssp(File pdbDir, File dsspDir) {
        LogInfo.track("Converting to dssp");
        try {
            List<File> pdbList = IO.ls(pdbDir, "pdb");
            int i = 0;
            for (File current : pdbList) {
                LogInfo.logs("Processing " + ++i + "/" + pdbList.size());
                // Same base name as the input, with the trailing "pdb" swapped for "dssp".
                File dsspFile = new File(dsspDir, current.getName().replaceAll("pdb$", "dssp"));
                // NOTE(review): paths containing spaces would break this command string.
                IO.call(this.dsspString + " " + current.getAbsolutePath() + " " + dsspFile.getAbsolutePath());
            }
        } finally {
            LogInfo.end_track();
        }
    }

    /**
     * Converts every DSSP file in {@code subList} and appends the result to {@code file}.
     *
     * @param subList DSSP files to convert, in output order
     * @param file    destination file, one "AA SS" pair per line and an empty line
     *                after each input file
     * @throws IllegalStateException if the destination cannot be opened
     */
    private void createMLComp(List<File> subList, File file) {
        PrintWriter out = IOUtils.openOutEasy(file);
        if (out == null) {
            // How openOutEasy reports failure is not visible here; guard against a
            // null writer so the problem surfaces with a useful message.
            throw new IllegalStateException("Could not open " + file + " for writing");
        }
        LogInfo.track("Creating file " + file);
        try {
            int i = 0;
            for (File f : subList) {
                LogInfo.logs("Processing " + ++i + "/" + subList.size());
                this.convertDSSPFormat(f, out);
            }
        } finally {
            // Always balance the log track and release the file handle, even when
            // a conversion fails half way through.
            LogInfo.end_track();
            out.close();
        }
    }

    /**
     * Writes the residue rows of one DSSP file to {@code out} as "AA SS" lines,
     * followed by a single empty line.
     *
     * <p>Rows are only read after the table header line has been seen. For each
     * row longer than 16 characters the characters at index 13 (amino acid) and
     * index 16 (secondary structure) are taken; rows whose amino-acid character is
     * not an upper-case ASCII letter are skipped, and a blank structure character
     * is written as {@code C}. Any other structure character is written unchanged.
     *
     * @param f   DSSP file to read
     * @param out writer receiving the converted rows
     */
    private void convertDSSPFormat(File f, PrintWriter out) {
        boolean started = false;
        for (String line : IO.i(f)) {
            if (!started) {
                started = DSSP_TABLE_HEADER.matcher(line).matches();
                continue;
            }
            if (line.length() <= STRUCTURE_COLUMN) {
                // Too short to hold both columns.
                continue;
            }
            char aminoAcid = line.charAt(AMINO_ACID_COLUMN);
            if (aminoAcid < 'A' || aminoAcid > 'Z') {
                continue;
            }
            char structure = line.charAt(STRUCTURE_COLUMN);
            if (structure == ' ') {
                structure = 'C';
            }
            out.println("" + aminoAcid + " " + structure);
        }
        out.println();
    }
}

