/*
 * DMCDataset.java
 *
 * Author: Martin Christian <martin@christianix.de>
 * Created on 7. Mai 2003, 23:26
 */

package parser;

import java.io.FileReader;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.StringTokenizer;

/**
 * In-memory table of integer observations with named columns.
 *
 * The value in the last column of every observation is treated as its class
 * label: a label of 1 is counted as "spam", every other value as "no spam".
 * A dataset whose last column is not named "target" (ignoring case) is
 * flagged as a validation dataset.
 *
 * Instances are mutable and not synchronized.
 *
 * @author  mai01edz
 */
public class DMCDataset {

	/** Row slot that has not received an observation yet. */
	private static final byte ROW_EMPTY = 0;
	/** Row slot holding an observation that was counted as spam. */
	private static final byte ROW_SPAM = 1;
	/** Row slot holding an observation that was counted as no spam. */
	private static final byte ROW_NOSPAM = 2;
	/** Initial capacity of the row buffer used while loading a file. */
	private static final int INITIAL_ROWS = 64;

	private String[] attributes;
	private int table[][];
	private String filename;
	private int n; // count rows
	private int m; // count columns
	private int spam;
	private int nospam;
	private boolean validation;
	// How each row slot was counted, so that replacing a row keeps the
	// spam/nospam counters consistent.
	private byte[] rowState;

	/**
	 * Creates an empty dataset with room for <code>n</code> observations.
	 * Rows are filled afterwards with {@link #addObservation(int, int[])}.
	 *
	 * @param n           number of row slots to allocate, must not be negative
	 * @param attributes  column labels; the array is stored as is, not copied
	 * @throws IllegalArgumentException if <code>n</code> is negative or
	 *         <code>attributes</code> is null or empty
	 */
	public DMCDataset( int n, String[] attributes ) {
		if ( n < 0 )
			throw new IllegalArgumentException( "Number of rows must not be negative: " + n );
		if ( attributes == null || attributes.length == 0 )
			throw new IllegalArgumentException( "At least one attribute is required" );
		this.n = n;
		this.m = attributes.length;
		this.attributes = attributes;
		this.table = new int[this.n][this.m];
		this.rowState = new byte[this.n];
		this.filename = null;
		this.spam = 0;
		this.nospam = 0;
		this.validation = !hasTargetColumn( attributes );
	}

	/**
	 * Loads a dataset from a whitespace separated text file. The first line
	 * holds the column labels, every following non-blank line holds one
	 * observation made of integers. Blank lines are skipped and tokens beyond
	 * the number of columns are ignored.
	 *
	 * The file is read in a single pass and is always closed again. If the
	 * file cannot be read or its content is malformed (empty file, empty
	 * header, too few values in a line, non-integer value) the stack trace is
	 * printed and the JVM is terminated with exit code -1.
	 *
	 * @param filename  path of the file to read
	 */
	public DMCDataset( String filename ) {
		BufferedReader reader = null;
		String line;

		this.filename = filename;
		this.spam = 0;
		this.nospam = 0;
		try {
			reader = new BufferedReader( new FileReader(filename) );

			// read column lables
			System.out.print( "Reading column lables..." );
			line = reader.readLine();
			if ( line == null )
				throw new IOException( "File is empty, no column labels found: " + filename );
			StringTokenizer tokenizer = new StringTokenizer( line );
			this.m = tokenizer.countTokens();
			if ( this.m == 0 )
				throw new IOException( "Header line contains no column labels: " + filename );
			this.attributes = new String[this.m];
			for ( int j=0; j<this.m; j++ ) this.attributes[j] = tokenizer.nextToken();
			System.out.println( "done." );

			// read data into a growing buffer, the number of rows is not known in advance
			System.out.print( "Reading data..." );
			int[][] rows = new int[INITIAL_ROWS][];
			int count = 0;
			int lineNumber = 1;
			while ( (line=reader.readLine()) != null ) {
				lineNumber++;
				int[] values = parseRow( line, this.m, lineNumber );
				if ( values == null ) continue; // blank line
				if ( count == rows.length ) {
					int[][] grown = new int[rows.length * 2][];
					System.arraycopy( rows, 0, grown, 0, count );
					rows = grown;
				}
				rows[count] = values;
				count++;
			}

			this.n = count;
			this.table = new int[count][];
			System.arraycopy( rows, 0, this.table, 0, count );
			this.rowState = new byte[count];
			for ( int i=0; i<count; i++ ) recordLabel( i, this.table[i][this.m-1] );
			this.validation = !hasTargetColumn( this.attributes );

			System.out.println( "done." );
			System.out.println( "Columns in Dataset: " + m );
			System.out.println( "Observations read: " + n );
		} catch ( IOException ioe ) {
			abortLoad( reader, ioe );
		} catch ( NumberFormatException nfe ) {
			abortLoad( reader, nfe );
		} finally {
			closeQuietly( reader );
		}
	}

	/**
	 * Tells whether the last column is labelled "target", ignoring case.
	 * Expects a non-empty array.
	 */
	private static boolean hasTargetColumn( String[] attributes ) {
		return "target".equalsIgnoreCase( attributes[attributes.length-1] );
	}

	/**
	 * Parses one data line into <code>columns</code> integers.
	 *
	 * @return the parsed values, or null if the line contains no tokens
	 * @throws IOException if the line holds fewer than <code>columns</code> tokens
	 * @throws NumberFormatException if a token is not an integer
	 */
	private static int[] parseRow( String line, int columns, int lineNumber ) throws IOException {
		StringTokenizer tokenizer = new StringTokenizer( line );
		int tokens = tokenizer.countTokens();
		if ( tokens == 0 ) return null;
		if ( tokens < columns )
			throw new IOException( "Line " + lineNumber + ": expected " + columns
				+ " values but found " + tokens );

		int[] values = new int[columns];
		for ( int j=0; j<columns; j++ ) {
			String token = tokenizer.nextToken();
			try {
				values[j] = Integer.parseInt( token );
			} catch ( NumberFormatException nfe ) {
				throw new NumberFormatException( "Line " + lineNumber + ", column " + (j+1)
					+ ": not an integer: " + token );
			}
		}
		return values;
	}

	/** Closes the reader, ignoring a failure to do so; accepts null. */
	private static void closeQuietly( BufferedReader reader ) {
		if ( reader == null ) return;
		try {
			reader.close();
		} catch ( IOException ignored ) {
			// nothing useful can be done if closing fails
		}
	}

	/** Releases the reader, reports the cause and terminates the JVM with exit code -1. */
	private static void abortLoad( BufferedReader reader, Exception cause ) {
		closeQuietly( reader );
		cause.printStackTrace();
		System.exit(-1);
	}

	/**
	 * Updates the spam/nospam counters for row slot <code>i</code>. If the
	 * slot was counted before, that earlier count is taken back first.
	 */
	private void recordLabel( int i, int label ) {
		if ( rowState[i] == ROW_SPAM ) this.spam--;
		else if ( rowState[i] == ROW_NOSPAM ) this.nospam--;

		if ( label == 1 ) {
			rowState[i] = ROW_SPAM;
			this.spam++;
		}
		else {
			rowState[i] = ROW_NOSPAM;
			this.nospam++;
		}
	}

	/**
	 * Stores a copy of <code>row</code> in row slot <code>i</code> and counts
	 * it as spam if its last column is 1, otherwise as no spam. Replacing an
	 * already filled slot replaces its count as well.
	 *
	 * If <code>i</code> is outside the table a message is written to
	 * System.err and nothing is stored.
	 *
	 * @param i    row slot, 0 &lt;= i &lt; number of rows
	 * @param row  values of the observation, at least one per column
	 * @throws IllegalArgumentException if <code>row</code> is null or has
	 *         fewer values than the dataset has columns
	 */
	public void addObservation( int i, int[] row ) {
		if ( i < 0 || i >= n ) {
			System.err.println( "Can't add observation, array out of bounds!" );
			return;
		}
		// validate before touching the table so a bad row leaves the dataset unchanged
		if ( row == null || row.length < this.m )
			throw new IllegalArgumentException( "Observation needs at least " + this.m + " values" );

		table[i] = (int[])row.clone();
		recordLabel( i, row[this.m-1] );
	}

	/** @return true unless the last column is labelled "target" (ignoring case) */
	public boolean isValidation() {
		return this.validation;
	}

	/** @return number of stored observations whose last column is 1 */
	public int getSpam() {
		return this.spam;
	}

	/** @return number of stored observations whose last column is not 1 */
	public int getNoSpam() {
		return this.nospam;
	}

	/**
	 * @return number of observations stored so far (spam plus no spam); this
	 *         can be smaller than the number of allocated row slots
	 */
	public int getN() {
		return this.spam + this.nospam;
	}

	/** @return the internal table itself, not a copy */
	public int[][] getTable() {
		return table;
	}

	/** @return the internal array of column labels itself, not a copy */
	public String[] getAttributes() {
		return attributes;
	}

	/** @return a copy of the given row */
	public int[] getRow( int row ) {
		return (int[])table[row].clone();
	}

	/** @return a new array holding the values of column <code>col</code> for all row slots */
	public int[] getColumn( int col ) {
		int[] newcol = new int[n];
		for ( int i=0; i<this.n; i++ ) newcol[i] = table[i][col];
		return newcol;
	}

	/**
	 * Builds a new dataset from every (1/p)-th row, starting with row 0.
	 * The result holds ceil(n*p) observations; for p &gt; 1 rows are repeated.
	 *
	 * @param p  fraction of the rows to take
	 * @return the subset, or null if <code>p</code> is not greater than 0
	 */
	public DMCDataset getSubset( double p ) {
		// Checked first so that a negative fraction never reaches the allocation below
		if ( !(p > 0) ) return null;

		// Compute the storage needed for the new dataset
		int capacity = (int)Math.ceil( this.n * p );  // round n*p up
		DMCDataset dmcdata = new DMCDataset( capacity, this.attributes );

		double step = 1.0 / p;
		double counter = 0;
		for ( int j=0; j<capacity; j++ ) {
			// Rounding errors in the accumulated step must not run past the last row
			int i = Math.min( (int)counter, this.n - 1 );
			dmcdata.addObservation( j, this.table[i] );
			counter += step;
		}

		return dmcdata;
	}

	/**
	 * Splits the rows into a training and a test dataset. Every (1/p)-th row,
	 * starting with row 0, goes to the training dataset, all other rows to
	 * the test dataset. The training dataset holds ceil(n*p) observations.
	 *
	 * @param p  fraction of the rows to use for training, at most 1
	 * @return array of two datasets: index 0 is the training dataset, index 1
	 *         the test dataset; if <code>p</code> is not greater than 0 the
	 *         training dataset is null and the test dataset is this object
	 * @throws IllegalArgumentException if <code>p</code> is greater than 1
	 */
	public DMCDataset[] splitData( double p ) {
		DMCDataset dmctrain;
		DMCDataset dmctest;
		DMCDataset[] dmcdatasets;

		if ( p > 0 ) {
			if ( p > 1 )
				throw new IllegalArgumentException( "Training fraction must not exceed 1: " + p );

			// Compute the storage needed for the new datasets
			int trainSize = Math.min( this.n, (int)Math.ceil( p * this.n ) );  // round n*p up
			int testSize = this.n - trainSize;
			dmctrain = new DMCDataset( trainSize, this.attributes );
			dmctest = new DMCDataset( testSize, this.attributes );

			double step = 1.0 / p;
			double counter = 0;
			int next = 0; // index of the next row picked for training
			int traini = 0;
			int testi = 0;
			for ( int i=0; i<this.n; i++ ) {
				boolean toTrain = ( i == next );
				if ( toTrain ) {
					counter += step;
					next = (int)counter;
				}
				// Rounding errors in the accumulated step must not overflow one
				// part and lose the row; send it to the part that still has room.
				if ( toTrain && traini == trainSize ) toTrain = false;
				else if ( !toTrain && testi == testSize ) toTrain = true;

				if ( toTrain ) {
					dmctrain.addObservation( traini, this.table[i] );
					traini++;
				}
				else {
					dmctest.addObservation( testi, this.table[i] );
					testi++;
				}
			}
			System.out.println( "Obs. in training dataset: " + traini );
			System.out.println( "Obs. in test dataset: " + testi );
		}
		else {
			dmctrain = null;
			dmctest = this;
		}

		dmcdatasets = new DMCDataset[2];
		dmcdatasets[0] = dmctrain;
		dmcdatasets[1] = dmctest;
		return dmcdatasets;
	}
}
