// The DataCollection class reads the dataset from a file and stores it.
// The constructor contains the file parser.
class DataCollection
{
  int  numVars = 0 ;    // number of variables
  int  numCases = 0 ;   // number of data cases
  String[]  labels ;    // label for each variable (per column)
  float[]  minV ;       // minimum value per variable
  float[]  maxV ;       // maximum value per variable
  float[][] theData ;   // the actual (numeric) data
  boolean[]  isText ;   // marks whether a variable is actually text
  String[][] dataStrings ;  // collection of data case labels
  boolean dataStringsMade = false ;
  
  DataCollection( String fileNm )
  {
    int  i ;
    int startL = 0 ;
    String lines[] = loadStrings( fileNm );    // read in the entire file

    println("there are " + lines.length + " lines");

    // check first line for labels
    // chop the first line into an array of tokens not including commas
    String[]  firstL = splitTokens( lines[0], " ," ); 
    
    if( firstL.length == 0 )
    {
      println( "Error: Data file has empty first line\nQuitting" );
      numVars = -1 ;
      return ;
    }

    // try to parse the first line as Float. if it fails, we have labels!
    // if it succeeds, then create fake labels Var0, Var1...
    try 
    { 
      float x = Float.parseFloat( firstL[0] );
      // exception will be caught if the first line is non-numeric
      labels =  split( lines[0], "," );
      if( labels.length > 0 )
        numVars = labels.length ;
      else
      {
        println( "Error: Data file has empty content\nQuitting" );
        numVars = -1 ;
        return ;
      }
      // create fake labels
      for( i = 0 ; i < numVars ; i++ )
      {
        labels[i] = "Var" + i ;
      }
    } 
    catch( NumberFormatException nFE )   // found non-numeric in first line
    { 
      // println( "first line is labels:" + lines[0] );
      labels = split( lines[0], "," );
      if( labels.length > 0 )
        numVars = labels.length ;
      else
      {
        println( "Error: Data file has empty content\nQuitting" );
        numVars = -1 ;
        return ;
      }
      startL = 1 ;    // set the initial line to index 1
    }
    numCases = lines.length - startL ;
    
    print( "Data Variable Labels: " );
    for(  i = 0 ; i < numVars ; i++ )
      print( "<" + labels[i] + "> " );
    println();
   
    // now actually create the arrays
    theData = new float[numCases][numVars] ;
    
    maxV = new float[numVars] ;
    minV = new float[numVars] ;
    isText = new boolean[numVars] ;
    for( int j = 0 ; j < numVars ; j++ )
    {
      maxV[j] = -1000000.0 ;
      minV[j] =  1000000.0 ;
      isText[j] = false ;
    }
    
    String[] caseArray = split( lines[startL], "," );
    for( i = 0 ; i < caseArray.length ; i++ )
    {
      float f ;
       print( "<" + caseArray[i] + "> " );
       try
       {
         f = Float.parseFloat( caseArray[i] );
         // do nothing more. variable is float
       }
       catch( Exception e )
      {
         isText[i] = true ;
         if( !dataStringsMade )
         {
           dataStrings = new String[numCases][numVars] ;
           dataStringsMade = true ;
         }
        print( "TEXT " );
      }
    }
    println();
   
    int li ;
    for( li = startL, i = 0 ; i < numCases ; i++, li++ ) 
    {
      for( int j = 0 ; j < numVars ; j++ )
        theData[i][j] = 0.0 ;      // set a default value
        
      caseArray = split( lines[li], "," );

      for( int j = 0 ; j < caseArray.length ; j++ )
      {
        if( !isText[j] )
        {
          try
          {
            theData[i][j] = Float.parseFloat( caseArray[j] );
            if( theData[i][j] > maxV[j] )
              maxV[j] = theData[i][j] ;
            if( theData[i][j] < minV[j] )
              minV[j] = theData[i][j] ;
          }
          catch( Exception e )
          {
          }
        }
        else
        {
          dataStrings[i][j] = caseArray[j] ;
        }
      }
    }

    SortedSet labelSet = new TreeSet();
    String[] uniqLabels ;
    for( int j = 0 ; j < numVars ; j++ )
    {
      if( !isText[j] )
        continue ;
      labelSet.clear();
      for( i= 0 ; i < numCases ; i++ ) 
      {
        labelSet.add( dataStrings[i][j] );

      }
        print( labelSet );
      uniqLabels = new String[labelSet.size()] ;
      Iterator it = labelSet.iterator();
      i = 0 ;
      while( it.hasNext() )
      {
        uniqLabels[i] = (String)it.next();
          println( " uniqL = " + uniqLabels[i] );
          i++;
      }
      for( i= 0 ; i < numCases ; i++ ) 
      {
      println( "Var number " +  j  );
        for( int k = 0 ; k < uniqLabels.length ; k++ )
        {

           if( uniqLabels[k].equals( dataStrings[i][j] ) )
           {
              theData[i][j] = (float) k ;
               println( " uniqL = " + uniqLabels[k] + " " + k );
           }
        }
      }
      minV[j] = 0.0 ;
      maxV[j] = (float)(uniqLabels.length -1) ;
    }

  }
  
  
  
  int getNumVars()
  {
    return( numVars );
  }
  int getNumCases()
  {
    return( numCases );
  }
  
  float getData(int caseIdx, int varIdx )
  {
    return( theData[caseIdx][varIdx] );
  }
  
  float getMin( int varIdx )
  {
    return( minV[varIdx] );
  }
  float getMax( int varIdx )
  {
    return( maxV[varIdx] );
  }
  String getVarName( int varIdx )
  {
    return( labels[varIdx] );
  }
  int getVarIdx( String varNm )
  {
    for( int i = 0 ; i < numVars ; i++ )
    {
      if( labels[i].equals( varNm ) )
        return( i );
    }
    return( -1 );
  }

}

