Skip to content
Snippets Groups Projects
xval.c 8.50 KiB
/*************************************************************************/
/*									 */
/*  Copyright 2010 Rulequest Research Pty Ltd.				 */
/*									 */
/*  This file is part of C5.0 GPL Edition, a single-threaded version	 */
/*  of C5.0 release 2.07.						 */
/*									 */
/*  C5.0 GPL Edition is free software: you can redistribute it and/or	 */
/*  modify it under the terms of the GNU General Public License as	 */
/*  published by the Free Software Foundation, either version 3 of the	 */
/*  License, or (at your option) any later version.			 */
/*									 */
/*  C5.0 GPL Edition is distributed in the hope that it will be useful,	 */
/*  but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU	 */
/*  General Public License for more details.				 */
/*									 */
/*  You should have received a copy of the GNU General Public License	 */
/*  (gpl.txt) along with C5.0 GPL Edition.  If not, see 		 */
/*									 */
/*      <http://www.gnu.org/licenses/>.					 */
/*									 */
/*************************************************************************/



/*************************************************************************/
/*									 */
/*	Carry out crossvalidation trials				 */
/*	--------------------------------				 */
/*									 */
/*************************************************************************/

#include "defns.i"
#include "extern.i"


DataRec	*Blocked=Nil;
float	**Result=Nil;	/* Result[f][0] = tree/ruleset size
				    [1] = tree/ruleset errors
				    [2] = tree/ruleset cost  */



/*************************************************************************/
/*									 */
/*	Outer function (differs from xval script)			 */
/*									 */
/*************************************************************************/


void CrossVal()
/*   --------  */
{
    CaseNo	i, Size, Start=0, Next, SaveMaxCase;
    int		f, SmallTestBlocks, t, SaveTRIALS;
    ClassNo	c;
    static CaseNo *ConfusionMat=Nil;
    static int    SaveFOLDS=0;

    /*  Check for left-overs after interrupt  */

    if ( Result )
    {
	FreeVector((void **) Result, 0, SaveFOLDS-1);
	Free(ConfusionMat);
    }

    if ( FOLDS > MaxCase+1 )
    {
	fprintf(Of, T_FoldsReduced);
	FOLDS = MaxCase+1;
    }

    Result	 = AllocZero((SaveFOLDS = FOLDS), float *);
    Blocked	 = Alloc(MaxCase+1, DataRec);
    ConfusionMat = AllocZero((MaxClass+1)*(MaxClass+1), CaseNo);

    Prepare();

    SaveMaxCase = MaxCase;
    SaveTRIALS  = TRIALS;

    /*  First test blocks may be smaller than the others  */

    SmallTestBlocks = FOLDS - ((MaxCase+1) % FOLDS);
    Size = (MaxCase + 1) / FOLDS;

    ForEach(f, 0, FOLDS-1)
    {
	fprintf(Of, "\n\n[ " T_Fold " %d ]\n", f+1);
	Result[f] = AllocZero(3, float);

	if ( f == SmallTestBlocks ) Size++;
	MaxCase = SaveMaxCase - Size;

	ForEach(i, 0, MaxCase)
	{
	    Case[i] = Blocked[Start];
	    Start = (Start + 1) % (SaveMaxCase + 1);
	}

	ConstructClassifiers();

	/*  Check size (if appropriate) and errors  */

	if ( TRIALS == 1 )
	{
	    Result[f][0] = ( RULES ? RuleSet[0]->SNRules :
				     TreeSize(Pruned[0]) );
	    Next = Start;
	    ForEach(i, 0, Size-1)
	    {
		Case[i] = Blocked[Next];
		c = ( RULES ? RuleClassify(Blocked[Next], RuleSet[0]) :
			      TreeClassify(Blocked[Next], Pruned[0]) );
		if ( c != Class(Blocked[Next]) )
		{
		    Result[f][1] += 1.0;
		    if ( MCost )
		    {
			Result[f][2] += MCost[c][Class(Blocked[Next])];
		    }
		}

		/*  Add to confusion matrix for target classifier  */

		ConfusionMat[ Class(Blocked[Next])*(MaxClass+1)+c ]++;

		Next = (Next + 1) % (SaveMaxCase + 1);
	    }
	}
	else
	{
	    Result[f][0] = -1;
	    Next = Start;
	    Default = ( RULES ? RuleSet[0]->SDefault : Pruned[0]->Leaf );
	    ForEach(i, 0, Size-1)
	    {
		Case[i] = Blocked[Next];
		c = BoostClassify(Blocked[Next], TRIALS-1);
		if ( c != Class(Blocked[Next]) )
		{
		    Result[f][1] += 1.0;
		    if ( MCost )
		    {
			Result[f][2] += MCost[c][Class(Blocked[Next])];
		    }
		}

		/*  Add to confusion matrix for target classifier  */

		ConfusionMat[ Class(Blocked[Next])*(MaxClass+1)+c ]++;

		Next = (Next + 1) % (SaveMaxCase + 1);
	    }
	}

	Result[f][1] = (100.0 * Result[f][1]) / Size;
	Result[f][2] /= Size;

	fprintf(Of, T_EvalHoldOut, Size);
	MaxCase = Size-1;
	Evaluate(0);

	/*  Free space used by classifiers  */

	ForEach(t, 0, MaxTree)
	{
	    FreeClassifier(t);
	}
	MaxTree = -1;

	TRIALS = SaveTRIALS;
    }

    /*  Print summary of crossvalidation  */

    MaxCase = SaveMaxCase;

    Summary();
    PrintConfusionMatrix(ConfusionMat);

    /*  Free local storage  */

    ForEach(i, 0, MaxCase)
    {
	Case[i] = Blocked[i];
    }

    FreeVector((void **) Result, 0, FOLDS-1);		Result = Nil;
    Free(Blocked);					Blocked = Nil;
    Free(ConfusionMat);					ConfusionMat = Nil;
}



/*************************************************************************/
/*                                                                       */
/*      Prepare data for crossvalidation (similar to xval-prep.c)	 */
/*                                                                       */
/*************************************************************************/


void Prepare()
/*   -------  */
{
    CaseNo	i, First=0, Last, *Temp, Hold, Next=0;
    ClassNo	Group;
    Temp = Alloc(MaxCase+1, CaseNo);
    ForEach(i, 0, MaxCase)
    {
	Temp[i] = i;
    }

    Shuffle(Temp);

    /*  Sort into class groups  */

    while ( First <= MaxCase )
    {
	Last = First;
	Group = Class(Case[Temp[First]]);

	ForEach(i, First+1, MaxCase)
	{
	    if ( Class(Case[Temp[i]]) == Group )
	    {
		Last++;
		Hold = Temp[Last];
		Temp[Last] = Temp[i];
		Temp[i] = Hold;
	    }
	}

	First = Last+1;
    }

    /*  Organize into stratified blocks  */

    ForEach(First, 0, FOLDS-1)
    {
	for ( i = First ; i <= MaxCase ; i += FOLDS )
	{
	    Blocked[Next++] = Case[Temp[i]];
	}
    }

    Free(Temp);
}



/*************************************************************************/
/*                                                                       */
/*      Shuffle the data cases                                           */
/*                                                                       */
/*************************************************************************/


void Shuffle(int *Vec)
/*   -------  */
{
    int	This=0, Alt, Left=MaxCase+1, Hold;

    ResetKR(KRInit);

    while ( Left )
    {
	Alt = This + (Left--) * KRandom();

	Hold 	    = Vec[This];
	Vec[This++] = Vec[Alt];
	Vec[Alt]    = Hold;
    }
}


/*************************************************************************/
/*									 */
/*	Summarise a crossvalidation					 */
/*									 */
/*************************************************************************/


char
     *FoldHead[] = { F_Fold, F_UFold, "" };

void Summary()
/*   -------  */
{
    int		i, f, t;
    Boolean	PrintSize=true;
    float	Sum[3], SumSq[3];
    extern char	*StdP[], *StdPC[], *Extra[], *ExtraC[];

    for ( i = 0 ; i < 3 ; i++ )
    {
	Sum[i] = SumSq[i] = 0;
    }

    ForEach(f, 0, FOLDS-1)
    {
	if ( Result[f][0] < 1 ) PrintSize = false;
    }

    fprintf(Of, "\n\n[ " T_Summary " ]\n\n");

    ForEach(t, 0, 2)
    {
	fprintf(Of, "%s", FoldHead[t]);
	putc('\t', Of);
	if ( RULES )
	{
	    fprintf(Of, "%s", ( MCost ? ExtraC[t] : Extra[t] ));
	}
	else
	{
	    fprintf(Of, "%s", ( MCost ? StdPC[t] : StdP[t] ));
	}
	putc('\n', Of);
    }
    putc('\n', Of);

    ForEach(f, 0, FOLDS-1)
    {
	fprintf(Of, "%4d\t", f+1);

	if ( PrintSize )
	{
	    fprintf(Of, " %5g", Result[f][0]);
	}
	else
	{
	    fprintf(Of, "     *");
	}
	fprintf(Of, " %10.1f%%", Result[f][1]);

	if ( MCost )
	{
	    fprintf(Of, "%7.2f", Result[f][2]);
	}
	fprintf(Of, "\n");

	for ( i = 0 ; i < 3 ; i++ )
	{
	    Sum[i] += Result[f][i];
	    SumSq[i] += Result[f][i] * Result[f][i];
	}
    }

    fprintf(Of, "\n  " T_Mean "\t");

    if ( ! PrintSize )
    {
	fprintf(Of, "      ");
    }
    else
    {
	fprintf(Of, "%6.1f", Sum[0] / FOLDS);
    }

    fprintf(Of, " %10.1f%%", Sum[1] / FOLDS);

    if ( MCost )
    {
	fprintf(Of, "%7.2f", Sum[2] / FOLDS);
    }

    fprintf(Of, "\n  " T_SE "\t");

    if ( ! PrintSize )
    {
	fprintf(Of, "      ");
    }
    else
    {
	fprintf(Of, "%6.1f", SE(Sum[0], SumSq[0], FOLDS));
    }

    fprintf(Of, " %10.1f%%", SE(Sum[1], SumSq[1], FOLDS));

    if ( MCost )
    {
	fprintf(Of, "%7.2f", SE(Sum[2], SumSq[2], FOLDS));
    }
    fprintf(Of, "\n");
}



float SE(float sum, float sumsq, int no)
/*    --  */
{
    float mean;

    mean = sum / no;

    return sqrt( ((sumsq - no * mean * mean) / (no - 1)) / no );
}