/*  info.c  */
/*************************************************************************/
/*									 */
/*  Copyright 2010 Rulequest Research Pty Ltd.				 */
/*									 */
/*  This file is part of C5.0 GPL Edition, a single-threaded version	 */
/*  of C5.0 release 2.07.						 */
/*									 */
/*  C5.0 GPL Edition is free software: you can redistribute it and/or	 */
/*  modify it under the terms of the GNU General Public License as	 */
/*  published by the Free Software Foundation, either version 3 of the	 */
/*  License, or (at your option) any later version.			 */
/*									 */
/*  C5.0 GPL Edition is distributed in the hope that it will be useful,	 */
/*  but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU	 */
/*  General Public License for more details.				 */
/*									 */
/*  You should have received a copy of the GNU General Public License	 */
/*  (gpl.txt) along with C5.0 GPL Edition.  If not, see 		 */
/*									 */
/*      <http://www.gnu.org/licenses/>.					 */
/*									 */
/*************************************************************************/



/*************************************************************************/
/*									 */
/*	Calculate information, information gain, and print dists	 */
/*	--------------------------------------------------------	 */
/*									 */
/*************************************************************************/


#include <math.h>

#include "defns.i"
#include "extern.i"


/*************************************************************************/
/*									 */
/*	Given GEnv.Freq[][], compute the information gain.		 */
/*									 */
/*************************************************************************/


/*  Information gain of the split whose per-value class frequencies are
    held in GEnv.Freq[1..MaxVal].

    Returns None when TotalCases is zero, 0.0 when the information left
    after the split is not below BaseInfo, and otherwise the reduction
    (BaseInfo minus the information after the split) multiplied by
    (1 - UnknFrac).  */

double ComputeGain(double BaseInfo, float UnknFrac, DiscrValue MaxVal,
		   CaseCount TotalCases)
/*     -----------  */
{
    DiscrValue	Branch;
    double	SplitInfo=0.0;

    /*  With no cases taking part there is nothing to measure  */

    if ( TotalCases == 0 )
    {
	return None;
    }

    /*  Add up the information of every subset formed by the test,
	classes 1..MaxClass, then divide by the number of cases  */

    ForEach(Branch, 1, MaxVal)
    {
	SplitInfo += TotalInfo(GEnv.Freq[Branch], 1, MaxClass);
    }
    SplitInfo = SplitInfo / TotalCases;

    /*  A split that does not lower the information has no gain  */

    if ( BaseInfo <= SplitInfo )
    {
	return 0.0;
    }

    /*  Otherwise scale the reduction to allow for unknown values  */

    return (1 - UnknFrac) * (BaseInfo - SplitInfo);
}



/*************************************************************************/
/*									 */
/*	Compute the total information in V[ MinVal..MaxVal ]		 */
/*									 */
/*************************************************************************/

/*  Returns T * Log(T) - (sum of N * Log(N)), where N runs over the
    entries V[MinVal..MaxVal] and T is their total.  For two entries
    P and N this is (P+N)*Log(P+N) - P*Log(P) - N*Log(N).

    Each entry is copied into a CaseCount before it is used.  */

double TotalInfo(double V[], DiscrValue MinVal, DiscrValue MaxVal)
/*     ---------  */
{
    DiscrValue	i;
    CaseCount	Count;
    double	Total=0, CountLogSum=0.0;

    ForEach(i, MinVal, MaxVal)
    {
	Count = V[i];

	Total += Count;
	CountLogSum += Count * Log(Count);
    }

    return Total * Log(Total) - CountLogSum;
}

/*************************************************************************/
/*									 */
/*	Compute the information in an Implication Sample 		 */
/*									 */
/*************************************************************************/

/*  Information of a sample described by its positive, negative and
    implication counts (P, N and I below).

    The probability p of a positive case is taken to be

	P / (P + N)					when I is zero,

    and otherwise the root, taken with the positive square root, of

	I * p^2 + (P + N - I) * p - P = 0.

    The value returned is  -p * Log(p) - (1 - p) * Log(1 - p).

    A sample with I == 0 and P + N == 0 returns 0.0; evaluating the
    ratio above would divide zero by zero and hand NaN back to the
    caller.  */

double ImplicationInfo(double numPositive, double numNegative, double numImplications)
/*     ---------------  */
{
    double labelled = numPositive + numNegative;
    double probabilityPos, probabilityNeg;

    if ( numImplications == 0.0 )
    {
	/*  Empty sample: nothing to measure, and 0/0 must be avoided  */

	if ( labelled == 0.0 )
	{
	    return 0.0;
	}

	probabilityPos = numPositive / labelled;
    }
    else
    {
	/*  Closed-form root of the quadratic; b is its linear
	    coefficient P + N - I  */

	double b = labelled - numImplications;

	probabilityPos = ( sqrt(b * b + 4 * numImplications * numPositive) - b ) /
			 ( 2 * numImplications );
    }

    probabilityNeg = 1.0 - probabilityPos;

    /*  The two terms are kept in separate statements so each use of
	the Log macro stands alone  */

    double infoPos = -probabilityPos * Log(probabilityPos);
    double infoNeg = -probabilityNeg * Log(probabilityNeg);

    return infoPos + infoNeg;
}



/*************************************************************************/
/*									 */
/*	Print distribution table for given attribute			 */
/*									 */
/*************************************************************************/


/*  Writes a frequency table to Of: a header row with the names of
    classes 1..MaxClass, then one bracketed row per value in
    MinVal..MaxVal holding Freq[value][class] for every class.

    When ShowNames is set each row is labelled with a name: "unknown"
    for value 0, AttValName[Att][value] if MaxAttVal[Att] is non-zero,
    and otherwise "N/A", "below" or "above" for value 1, value 2 and
    anything else.  Without ShowNames the row is labelled with the
    value number.  ValFreq is not used.  */

void PrintDistribution(Attribute Att, DiscrValue MinVal, DiscrValue MaxVal,
		       double **Freq, double *ValFreq, Boolean ShowNames)
/*   -----------------  */
{
    DiscrValue	Row;
    ClassNo	Col;
    String	Label;

    /*  Header row of class names  */

    fprintf(Of, "\n\t\t\t ");
    ForEach(Col, 1, MaxClass)
    {
	fprintf(Of, "%7.6s", ClassName[Col]);
    }
    fprintf(Of, "\n");

    /*  One row per attribute value  */

    ForEach(Row, MinVal, MaxVal)
    {
	if ( ! ShowNames )
	{
	    fprintf(Of, "\t\t[%-7d:", Row);
	}
	else
	{
	    if ( Row == 0 )
	    {
		Label = "unknown";
	    }
	    else
	    if ( MaxAttVal[Att] )
	    {
		Label = AttValName[Att][Row];
	    }
	    else
	    {
		Label = ( Row == 1 ? "N/A" : Row == 2 ? "below" : "above" );
	    }

	    fprintf(Of, "\t\t[%-7.7s:", Label);
	}

	ForEach(Col, 1, MaxClass)
	{
	    fprintf(Of, " %6.1f", Freq[Row][Col]);
	}

	fprintf(Of, "]\n");
    }
}