シャコ・エビ日記

シャコパンチ、エビパチン研究者の記録

ヒストグラム自動描画プログラム

情報量統計学 (情報科学講座 A・5・4) (情報科学講座 (A・5・4))

情報量統計学 (情報科学講座 A・5・4) (情報科学講座 (A・5・4))


数値データずらずらっとあって、それらがどんな分布しているかを、推定する。
f:id:katsumushi:20070628094553j:image
ヒストグラムのBin幅は恣意的に選ぶことになる。これを赤池情報量基準を用いて最適な幅を算出している。ソースは武骨ながら以下。

#! /usr/local/bin/perl
use List::Util qw(max min);

sub say { print @_, "\n";}

while(<>){
  chomp;
  push @data, split;
}
@test =  split /\./ ,$data[0];
$npoint = length($test[1]);
$sampling_unit = 10**(-1*$npoint);
$max = max( @data ) + $sampling_unit/2;
$min = min( @data ) - $sampling_unit/2;
$len = $#data + 1;
$ncol= int(2*sqrt($len))-1;
$na = int( ($max - $min) / $sampling_unit );
$ncol = min($ncol, $na, 200);
$scale = 250;


$range = ( $max - $min ) / $ncol;
for (0..$ncol-1){
  $hist_col[$_] = 0;
}
for my $data (@data){
  for my $i (0..$ncol-1){
    push @classes, [ $min + $range*$i, $min + $range*( $i+1 ) ];
    if( $data > $min + $range * $i and $data < $min + $range * ($i+1)){
      $hist_col[$i]++;
    }
  }
}

for my $r ( 1..$ncol-2 ){
  for my $first ( 1..$ncol-2 ){
    for my $last ( 1..$ncol-2 ){
      if($ncol == $r + $first + $last ){
	my @divs = @{fdividors($r)};
	for(@divs){
	  push @models, [$first, $_, $r/$_, $last];
	}
      }
    }
  }
}

for my $m ( @models ){
  my $firstc;
  my $lastc;
  my @each_hists;
  my $n = 0;
  my $nme = 1;
  my $nm = 1;
  my @hist;
  my $mid_val;
  while ( $n <= $ncol-1 ){
    while ( $n <= $m->[0]-1 ){
      $firstc += $hist_col[$n];
      $n++;
    }
    push @hist, $firstc;
    while ( $n >= $m->[0] and $n <= ($ncol - $m->[3] - 1)){
     while ( $nm <= $m->[2]){
	while ( $nme <= $m->[1] ){
	  $mid_val += $hist_col[$n];
	  $nme++;
	  $n++;
	}
	push @hist, $mid_val;
	$nme = 1;
	$mid_val = 0;
	$nm++;
      }
    }
    while ( $n <= $ncol-1){
      $lastc += $hist_col[$n];
      $n++;
    }
    push @hist, $lastc;
  }
  my $histvalref = [ \@hist, $m->[0], $m->[1], $m->[2], $m->[3] ];
  my $aic = aic( $histvalref );
  push @hist_models, [ $aic, @{$histvalref} ];
}

@hist_models = sort { $a->[0] <=> $b->[0] } @hist_models;

$gc1 = $hist_models[0][2];
$gr = $hist_models[0][3];
$grn = $hist_models[0][4];
$gc2 = $hist_models[0][5];
$maice = $hist_models[0][0];
$ml = $#{$hist_models[0][1]};
$fn = 1;
say "MAICE: $maice,   C1: $gc1, R: $gr, C2: $gc2";
for(1..24){ print "-"; }
for(1..10){
  print "+";
  for(1..9){
    print "-";
  }
}
say;
$max_val = max(@{$hist_models[0][1]});
for (1.. $gc1 ){
  my $c1val = $hist_models[0][1][0];
  printf "%3d:", $fn;
  printf "%8.3f~%8.3f", $classes[$fn - 1]->[0], $classes[$fn -1]->[1];
  printf "%3d:", $hist_col[$fn - 1];
  print '*' x int($c1val/$gc1/$max_val*$scale);
  say;
  $fn++;
}
for my $i ( 1.. $grn ){
  for (1.. $gr){
    printf "%3d:", $fn;
    printf "%8.3f~%8.3f", $classes[$fn - 1]->[0], $classes[$fn -1]->[1];
    printf "%3d:", $hist_col[$fn-1];
    print '*' x int($hist_models[0][1][$i]/$gr/$max_val*$scale);
    say;
    $fn++;
  }
}
for ( 1..$gc2 ){
  printf "%3d:", $fn;
  printf "%8.3f~%8.3f", $classes[$fn - 1]->[0], $classes[$fn -1]->[1];
  printf "%3d:", $hist_col[$fn-1];
  print '*' x int($hist_models[0][1][$ml]/$gc2/$max_val*$scale);
  say;
  $fn++;
}
for(1..24){ print "-"; }
for(1..10){
  print "+";
  for(1..9){
    print "-";
  }
}
say;
say "::other top 20 models::";
printf "AIC:%8.3f::%4d:%4d:%4d\n",
$hist_models[$_][0], $hist_models[$_][2], $hist_models[$_][3], $hist_models[$_][5] for 1..20;

sub aic {
  my $l;
  my $ref = shift;
  my @vals = @{$ref->[0]};
  my ( $c1, $r, $nr, $c2 ) = ( $ref->[1], $ref->[2], $ref->[3], $ref->[4] );
  my $lncol= $#vals + 1;
  my $n = 0;
  my $ii = 1;
  for my $i ( @vals ) {
    if( $i == 0 ){
      $vals[$n] = 1/2.7182818284;
    }
    $n++;
  }
  $l += $vals[0] * log( $vals[0] / ( $c1 * $len ));
  while ( $ii <= $nr ){
    $l += $vals[ $ii ] * log( $vals[ $ii ] / ( $r * $len ));
    $ii++;
  }
  $l += $vals[$#vals] * log( $vals[$#vals] / ( $c2 * $len ));
  -2*$l + 2*( $lncol - 1 );
}


sub fdividors {
  my $num = shift;
  my $ndiv = 0;
  my $dividor = 1;
  my @dividors;
  while($dividor<=$num){
    if($num % $dividor == 0){
      push @dividors, $dividor;
      $ndiv++;
    }
    $dividor++;
  }
  \@dividors;
}


=>「統計分析用のVBAプログラム群
ここにエクセルのモノを発見。