TDistribution class

This article gives the source code for a nice, clean implementation of a class to compile the discrete, cumulative distribution function of a sample data set. The CDF forms the basis for basic descriptive statistics.

(This article originally appeared in The Unofficial Newsletter of Delphi Users.)



TDistribution Class



This article gives the source code for a nice, clean implementation of a class to compile the discrete, cumulative distribution function of a sample data set. The CDF forms the basis for basic descriptive statistics.



TDistribution will automatically add 'bins' to the cdf as data elements are accumulated, or you can pre-define the bin boundaries before analyzing a data set. This auto-expansion of the cdf bins works best with sorted data sets (but for cool sorting components see http://www.connix.com/~btober/sorting.htm). In most cases it will work reasonably well without pre-sorting your data.



The class descends from TStringList, so you can use the Strings property to define a text string describing each 'bin', e.g.



  Strings[0]:='Jan';

  Strings[1]:='Feb';

  .

  .

  .

  Strings[11]:='Dec';



Thus the class flexibly supports your implementation within a variety of user interfaces (note that this is not a user interface component - it is "merely" a class that provides a specific, statistical functionality...it's up to you to implement the user interface appropriate for your particular application).



The class provides properties for estimating the sample data total and mean (based on the cdf).



An example project is also included.



{ ****************************************************************** }

{ Class for developing discrete, cumulative distribution function }

{ Copyright © 2000, Berend M. Tober. All rights reserved. }

{ Author's E-mail - mailto:btober@computer.org }

{ Other components at }

{ http://www.connix.com/~btober/delphi.htm}

{ ****************************************************************** }

unit Cdf;



{

  The cumulative distribution function (cdf) for a continuous, real

  random variable X is defined as the function F(x) where



    F(x) = P(X <= x), i.e., the probability that X <= x.



  The discrete case is similarly defined.



  The TDistribution class is used to generate an empirical CDF for a

  given data set by counting the number of values from the sample

  data set that fall into one of a set of discrete "bins".



  This way you can quickly get a quantitative picture of a data set.



  The class descends from TStringList, so you can use the Strings

  property to define a text string describing each 'bin', e.g.



    Strings[0]:='Jan';

    Strings[1]:='Feb';

      .

      .

      .

    Strings[11]:='Dec';

}




interface



uses classes;



type

  TDistributionItem = class(TObject)

    private

      FBin: Double; {Upper limit of bin}

      FCount: LongInt;

    public

      constructor Create(Value: Double;Count:LongInt);

      function Accumulate(Value: Double):LongInt;

      property Bin: Double Read FBin;

      property Count: LongInt Read FCount;

  end;



  TDistribution = class(TStringList)

    private

      function GetMean: double;

      function GetTotal: Double;

    public

      constructor Create;

      destructor Destroy;override;

      procedure Clear;

      function Accumulate(Value: Double):LongInt;

      function AddObject(const S: string; AObject: TObject):Integer;override;

      procedure FreeObjects;

      procedure Put(Index:Integer; const Value:TDistributionItem);

      function FreeObject(Index:Integer):Integer;

      function Get(Index:Integer):TDistributionItem;

      property Mean: Double Read GetMean;

      property Objects[Index:Integer]:TDistributionItem read Get write Put;

      property Total: Double Read GetTotal;

  end;





implementation



constructor TDistributionItem.Create(Value: Double; Count: LongInt);
{ Set up a bin whose upper limit is Value and whose count starts at Count }
begin
  inherited Create;
  FCount := Count;
  FBin := Value;
end;



function TDistributionItem.Accumulate(Value: Double): LongInt;
{ Count Value into this bin if it does not exceed the bin's upper limit.
  Returns the updated count, or -1 when Value was not counted. }
begin
  if Value <= FBin then
  begin
    FCount := FCount + 1;
    Result := FCount;
  end
  else
    Result := -1;
end;





constructor TDistribution.Create;
{ The distribution starts out empty; only the ancestor needs constructing }
begin
  inherited Create;
end;



destructor TDistribution.Destroy;
{ Release the bin objects held by the list before the list itself goes away }
begin
  Clear;              { frees every TDistributionItem, then empties the list }
  inherited Destroy;
end;



function TDistribution.AddObject(const S: string; AObject: TObject): Integer;
{ Add a 'bin' to the CDF, in proper order.

  S       - descriptive text for the bin.
  AObject - the bin itself; it is cast to TDistributionItem without a
            type check, so callers must pass a TDistributionItem.

  Returns the index at which the bin was inserted. }
var
  i: Integer;
begin
  {
    The new bin goes just before the first existing bin whose upper limit
    is strictly greater than the new bin's. If there is none (or the list
    is empty) it goes at position Count, i.e. it is appended.
  }
  Result := Count;
  for i := 0 to Pred(Count) do
    if TDistributionItem(AObject).Bin < Objects[i].Bin then
    begin
      Result := i;
      Break; { first match is the insertion point; no need to scan on }
    end;

  { Result is always within 0..Count here, and InsertObject at Count is
    an append, so no separate "append" branch is needed. }
  InsertObject(Result, S, AObject);
end;



function TDistribution.Accumulate(Value: Double): LongInt;
{ Count one data value into the cdf.
  Returns whatever the last bin's Accumulate returned. }
var
  Idx, Last: Integer;
begin
  Last := Pred(Count);

  if Last < 0 then
    { No bins yet: the cdf needs at least one, so start it at Value }
    AddObject('', TDistributionItem.Create(Value, 0))
  else if Objects[Last].Bin < Value then
    { Value is beyond the largest bin: add one that is big enough. It
      starts from the largest bin's count because counts are cumulative. }
    AddObject('', TDistributionItem.Create(Value, Objects[Last].Count));

  { Offer the value to every bin; each bin decides whether it counts }
  Idx := 0;
  while Idx < Count do
  begin
    Result := Objects[Idx].Accumulate(Value);
    Inc(Idx);
  end;
end;



function TDistribution.Get(Index: Integer): TDistributionItem;
{ Read accessor for the typed Objects property: fetches the object kept
  by the ancestor list at Index and hands it back as a TDistributionItem.
  The cast is unchecked. }
var
  Stored: TObject;
begin
  Stored := inherited Objects[Index];
  Result := TDistributionItem(Stored);
end;



function TDistribution.FreeObject(Index: Integer): Integer;
{ Free the bin object at Index and delete its list entry.

  Returns the index of the entry that now sits at (or nearest below) the
  deleted position, or -1 when Index is out of range or the list has
  become empty. }
begin
  Result := -1;

  { Reject both ends of the range; a negative index is treated the same
    way as one that is too large. }
  if (Index < 0) or (Index >= Count) then Exit;

  if Objects[Index] <> nil then
  begin
    Objects[Index].Free;
    Objects[Index] := nil; { never leave a dangling pointer in the slot }
  end;
  Delete(Index);

  { If the last entry was removed, step back to the new last entry; this
    yields -1 when the list is now empty. }
  if Index >= Count then
    Result := Pred(Count)
  else
    Result := Index;
end;



procedure TDistribution.FreeObjects;
{ Free and remove every bin, working from the end of the list backwards }
var
  Idx: Integer;
begin
  Idx := Pred(Count);
  while Idx >= 0 do
  begin
    FreeObject(Idx);
    Dec(Idx);
  end;
end;



procedure TDistribution.Put(Index: Integer; const Value: TDistributionItem);
{ Write accessor for the typed Objects property: stores Value in the
  ancestor list's object slot at Index. }
begin
  inherited Objects[Index] := Value;
end;



procedure TDistribution.Clear;
{ Empty the distribution: release the bin objects first, then let the
  ancestor clear whatever remains of the list. }
begin
  FreeObjects;
  inherited Clear;
end;



function TDistribution.GetTotal: Double;
{ ESTIMATE of the sample total: every value is assumed to equal the upper
  limit of the bin it was first counted into. Returns 0 for an empty cdf. }
var
  Idx: Integer;
  InBin: LongInt; { values belonging to bin Idx alone (counts are cumulative) }
begin
  if Count = 0 then
  begin
    Result := 0;
    Exit;
  end;

  { The first bin has no predecessor, so its count is used as-is }
  Result := Objects[0].Bin * Objects[0].Count;

  Idx := 1;
  while Idx < Count do
  begin
    InBin := Objects[Idx].Count - Objects[Pred(Idx)].Count;
    Result := Result + (Objects[Idx].Bin * InBin);
    Inc(Idx);
  end;

  {
    *** Alternative way to compute the total ***

    Result:=Objects[0].Bin*Objects[0].Count;
    for i:=1 to pred(Count) do
      Result:=Result
        +(Objects[i].Bin + Objects[pred(i)].Bin)
        *(Objects[i].Count - Objects[pred(i)].Count);
    Result:=Result/2.0;
  }
end;



function TDistribution.GetMean: Double;
{ ESTIMATE of the sample mean: the estimated total divided by the number
  of values counted, which is the cumulative count of the last bin.

  Returns 0.0 when there are no bins, or when bins exist but nothing has
  been counted yet (e.g. bins were predefined and no data accumulated),
  rather than dividing by zero. }
var
  N: LongInt;
begin
  Result := 0.0;
  if Count = 0 then Exit;

  N := Objects[Pred(Count)].Count;
  if N <> 0 then
    Result := GetTotal / N;
end;



end.



Example implementation



program Example;

{ Builds a cdf from a small, pre-sorted data set and prints the estimated
  mean and total, a comma-separated summary of the bins, and a crude
  text histogram of the cumulative counts. }

uses
  WinCRT, cdf;

const
  Data1: array[1..10] of Real = (66,73,73,81,81,81,81,85,85,89);
  {
    Note: This data set is pre-sorted. TDistribution will still
    work with unsorted data, but you might not get a good cdf
    unless you predefine the cdf bins. This is especially true
    if the first element of the dataset happens to be the largest
    (try it by re-arranging the above data set!) because automatic
    addition of bins will add a single bin into which ALL data
    will be counted.
  }

var
  { Signed loop counters: an unsigned counter would wrap when the upper
    bound pred(Count) is -1 for an empty distribution. }
  i, j: Integer;
  Dist: TDistribution;
  N: LongInt; { number of values counted = cumulative count of last bin }

begin
  Dist := TDistribution.Create;
  try
    for i := Low(Data1) to High(Data1) do
      Dist.Accumulate(Data1[i]);

    writeln('Mean = ', Dist.Mean:6:4);
    writeln('Total = ', Dist.Total:6:4);

    if Dist.Count > 0 then
    begin
      N := Dist.Objects[Pred(Dist.Count)].Count;

      { Print out quantitative summary: bin limit, cumulative count,
        cumulative fraction (#44 is a comma) }
      for i := 0 to Pred(Dist.Count) do
        writeln(Dist.Objects[i].Bin:6:2, #44, Dist.Objects[i].Count, #44,
                Dist.Objects[i].Count / N:6:3);

      { Print out crude 'histogram': up to 40 stars per bin, proportional
        to the bin's cumulative fraction }
      for i := 0 to Pred(Dist.Count) do
      begin
        for j := 1 to Trunc(40 * Dist.Objects[i].Count / N) do
          write('*');
        writeln;
      end;
    end;
  finally
    { Always release the distribution, even if something above raised }
    Dist.Free;
  end;
end.

 

Share this article!

Follow us!

Find more helpful articles: