/*
 * DMCBayes.java
 *
 * Author: Martin Christian <martin@christianix.de>
 * Created on 22. April 2003, 02:10
 */

package parser;

/**
 * Bayesian spam classifier operating on binary token attributes.
 *
 * <p>The probability that a token is an indicator for spam is computed as
 * follows:
 * <pre>
 *    n_spam   = number of spam mails containing the token
 *    n_nospam = number of non-spam mails containing the token
 *    N_SPAM   = total number of spam mails
 *    N_NOSPAM = total number of non-spam mails
 *
 *                               ( n_spam / N_SPAM )
 * p_spam(token) =  -------------------------------------------------
 *                  ( (2 * n_nospam / N_NOSPAM) + (n_spam / N_SPAM) )
 * </pre>
 * If the token never occurs in spam: p_spam = 0.01.
 * Otherwise, if the token never occurs in non-spam: p_spam = 0.99.
 *
 * <p>The probability of a mail being spam is then derived from the
 * probabilities t1..tM of the tokens present in that mail:
 * <pre>
 *                                t1 * ... * tM
 * p_spam(mail) = ---------------------------------------------
 *                ( t1 * ... * tM ) + ( (1-t1) * ... * (1-tM) )
 * </pre>
 * A mail is reported as spam when p_spam(mail) is strictly greater than the
 * configured threshold {@code alpha}.
 *
 * <p>Dataset layout as used by this class: column 0 of every row is taken as
 * the mail id, and for datasets where {@code isValidation()} is false the
 * last attribute column is read as the spam label (1 = spam). All columns in
 * between are token attributes (1 = token present).
 *
 * @author  Martin Christian
 */
public class DMCBayes implements DMCMethod {

	/** Threshold used by the no-argument constructor. */
	private static final double DEFAULT_ALPHA = 0.95;
	/** Probability assigned to a token that was never seen in spam. */
	private static final double NEVER_IN_SPAM = 0.01;
	/** Probability assigned to a token that was seen in spam only. */
	private static final double ONLY_IN_SPAM = 0.99;

	/** Cut-off: mails with a spam probability above this value count as spam. */
	private double alpha;
	/** Per-column token spam probabilities; index 0 is unused; null until trained. */
	private double[] spamtoken;

	/** Creates a new instance of DMCBayes with a threshold of 0.95. */
	public DMCBayes() {
		this( DEFAULT_ALPHA );
	}

	/**
	 * Creates a new instance with the given spam cut-off threshold.
	 *
	 * @param alpha mails with a spam probability above this value count as spam
	 */
	public DMCBayes( double alpha ) {
		this.alpha = alpha;
	}

	/**
	 * Returns the threshold followed by the trained per-column probabilities,
	 * e.g. {@code "Alpha = 0.95; [ 0.0 | 0.99 ]"}. An untrained or empty model
	 * is rendered as {@code "Alpha = 0.95; [ ]"}.
	 */
	public String toString() {
		StringBuffer sb = new StringBuffer();
		sb.append( "Alpha = " ).append( this.alpha ).append( "; [ " );
		// spamtoken is null until the first training run.
		if ( this.spamtoken != null && this.spamtoken.length > 0 ) {
			for ( int i = 0; i < this.spamtoken.length; i++ ) {
				if ( i > 0 ) {
					sb.append( " | " );
				}
				sb.append( this.spamtoken[i] );
			}
			sb.append( ' ' );
		}
		sb.append( "]" );
		return sb.toString();
	}

	/**
	 * Trains the classifier and/or applies it to {@code data}.
	 *
	 * <ul>
	 * <li>{@code DMC_TRAIN}: compute the token probabilities (skipped when
	 *     {@code data.isValidation()} is true, since no label column is read
	 *     from such data).</li>
	 * <li>{@code DMC_APPLY}: compute the spam probability of every mail using
	 *     the previously trained token probabilities.</li>
	 * <li>{@code DMC_ALL}: train, then apply, on the same dataset.</li>
	 * </ul>
	 * For any other action neither step runs.
	 *
	 * @param data   dataset to train on and/or classify
	 * @param action one of the {@code DMCMethod.DMC_*} action codes
	 * @return statistics built from the dataset, the per-mail spam
	 *         probabilities and the ids of the mails above the threshold; the
	 *         latter two are {@code null} when no apply step was performed
	 * @throws IllegalStateException if an apply step is requested before any
	 *         training took place
	 */
	public DMCStatistics eval( DMCDataset data, byte action ) {
		boolean doTrain = ( action == DMCMethod.DMC_ALL || action == DMCMethod.DMC_TRAIN );
		boolean doApply = ( action == DMCMethod.DMC_ALL || action == DMCMethod.DMC_APPLY );

		int columns = data.getAttributes().length;
		boolean labelled = !data.isValidation();
		// Exclusive upper bound of the token columns. Computed once so that
		// DMC_ALL scores exactly the same columns as a separate DMC_APPLY call
		// (the label column must be excluded only once).
		int tokenEnd = labelled ? columns - 1 : columns;

		double[] spammail = null;
		int[] spamids = null;

		System.out.println( "Threshold for spam cut-off: "+this.alpha );

		if ( doTrain && labelled ) {
			trainTokenProbabilities( data, tokenEnd );
		}

		if ( doApply ) {
			if ( this.spamtoken == null ) {
				throw new IllegalStateException(
					"DMCBayes must be trained before it can be applied" );
			}
			int n = data.getN();
			double[] logOdds = tokenLogOdds();
			int[] flagged = new int[n];
			int spamcounter = 0;
			spammail = new double[n];

			System.out.print("Calculate spam probabilities for each mail...");
			for ( int i = 0; i < n; i++ ) {
				int[] row = data.getRow( i );
				spammail[i] = mailSpamProbability( row, tokenEnd, logOdds );
				if ( spammail[i] > this.alpha ) {
					// Column 0 carries the mail id.
					flagged[spamcounter++] = row[0];
				}
			}
			spamids = new int[spamcounter];
			System.arraycopy( flagged, 0, spamids, 0, spamcounter );
			System.out.println("done.");
		}

		return new DMCStatistics( data, spammail, spamids );
	}

	/**
	 * Trains on {@code traindata} and then classifies {@code testdata}.
	 *
	 * @param traindata dataset used for the training step
	 * @param testdata  dataset used for the apply step
	 * @return the statistics of the apply step on {@code testdata}
	 */
	public DMCStatistics eval(DMCDataset traindata, DMCDataset testdata) {
		this.eval( traindata, DMCMethod.DMC_TRAIN );
		return this.eval( testdata, DMCMethod.DMC_APPLY );
	}

	/**
	 * Sets the spam cut-off threshold.
	 *
	 * @param alpha mails with a spam probability above this value count as spam
	 */
	public void setThreshold(double alpha) {
		this.alpha = alpha;
	}

	/**
	 * Returns the spam cut-off threshold.
	 *
	 * @return the current threshold
	 */
	public double getThreshold() {
		return this.alpha;
	}

	/**
	 * Computes the spam probability of every token column 1..labelColumn-1
	 * and stores the result in {@link #spamtoken}.
	 *
	 * @param data        labelled dataset
	 * @param labelColumn index of the column holding the spam label
	 */
	private void trainTokenProbabilities( DMCDataset data, int labelColumn ) {
		int n = data.getN();
		int[] isspam = data.getColumn( labelColumn );
		double spamTotal = data.getSpam();
		double nospamTotal = n - data.getSpam();
		double[] probabilities = new double[labelColumn];

		System.out.print("Calculate spamability of each attribute...");
		for ( int j = 1; j < labelColumn; j++ ) {
			int[] column = data.getColumn( j );
			int spamCount = 0;
			int nospamCount = 0;
			// Count in how many spam / non-spam mails the token occurs.
			for ( int i = 0; i < n; i++ ) {
				if ( column[i] == 1 ) {
					if ( isspam[i] == 1 ) {
						spamCount++;
					} else {
						nospamCount++;
					}
				}
			}
			if ( spamCount == 0 ) {
				probabilities[j] = NEVER_IN_SPAM;
			} else if ( nospamCount == 0 ) {
				probabilities[j] = ONLY_IN_SPAM;
			} else {
				double spamRate = spamCount / spamTotal;
				// Non-spam occurrences are weighted twice (see class comment).
				double nospamRate = ( 2.0 * nospamCount ) / nospamTotal;
				probabilities[j] = spamRate / ( spamRate + nospamRate );
			}
		}
		this.spamtoken = probabilities;
		System.out.println("done.");
	}

	/**
	 * Returns log( p / (1 - p) ) for every trained token probability p.
	 * Index 0 (the id column) is left at 0 and never read by the scorer.
	 */
	private double[] tokenLogOdds() {
		double[] logOdds = new double[this.spamtoken.length];
		for ( int j = 1; j < logOdds.length; j++ ) {
			double p = this.spamtoken[j];
			logOdds[j] = Math.log( p ) - Math.log( 1.0 - p );
		}
		return logOdds;
	}

	/**
	 * Combines the probabilities of the tokens present in {@code row} into the
	 * spam probability of the mail.
	 *
	 * <p>With a = t1*...*tM and b = (1-t1)*...*(1-tM) the result a / (a + b)
	 * equals 1 / (1 + exp(-sum(logOdds))). Summing log-odds avoids the
	 * underflow of both products to 0 (and thus a NaN result) for mails that
	 * contain many tokens. A mail without any token yields 0.5.
	 *
	 * @param row      attribute vector of one mail
	 * @param tokenEnd exclusive upper bound of the token columns
	 * @param logOdds  per-column log-odds as returned by {@link #tokenLogOdds()}
	 */
	private static double mailSpamProbability( int[] row, int tokenEnd, double[] logOdds ) {
		double sum = 0.0;
		for ( int j = 1; j < tokenEnd; j++ ) {
			if ( row[j] == 1 ) {
				sum += logOdds[j];
			}
		}
		return 1.0 / ( 1.0 + Math.exp( -sum ) );
	}
}
