Verify distribution uniformity/Naive: Difference between revisions
→{{header|OCaml}}: Added PureBasic |
Added PicoLisp |
||
Line 333: | Line 333: | ||
) h; |
) h; |
||
;;</lang> |
;;</lang> |
||
=={{header|PicoLisp}}== |
|||
The following function takes a count, and allowed deviation in per mill |
|||
(one-tenth of a percent), and a 'prg' code body (i.e. an arbitrary number of |
|||
executable expressions). |
|||
<lang PicoLisp>(de checkDistribution (Cnt Pm . Prg) |
|||
(let Res NIL |
|||
(do Cnt (accu 'Res (run Prg 1) 1)) |
|||
(let |
|||
(N (/ Cnt (length Res)) |
|||
Min (*/ N (- 1000 Pm) 1000) |
|||
Max (*/ N (+ 1000 Pm) 1000) ) |
|||
(for R Res |
|||
(prinl (cdr R) " " (if (>= Max (cdr R) Min) "Good" "Bad")) ) ) ) )</lang> |
|||
Output: |
|||
<pre>: (checkDistribution 100000 5 (rand 1 7)) |
|||
14299 Good |
|||
14394 Bad |
|||
14147 Bad |
|||
14418 Bad |
|||
14159 Bad |
|||
14271 Good |
|||
14312 Good</pre> |
|||
=={{header|PureBasic}}== |
=={{header|PureBasic}}== |
||
<lang PureBasic>Prototype RandNum_prt() |
<lang PureBasic>Prototype RandNum_prt() |
Revision as of 15:53, 22 June 2010
You are encouraged to solve this task according to the task description, using any language you may know.
This task is an adjunct to Seven-dice from Five-dice.
Create a function to check that the random integers returned from a small-integer generator function have uniform distribution.
The function should take as arguments:
- The function (or object) producing random integers.
- The number of times to call the integer generator.
- A 'delta' value of some sort that indicates how close to a flat distribution is close enough.
The function should produce:
- Some indication of the distribution achieved.
- An 'error' if the distribution is not flat enough.
Show the distribution checker working when the produced distribution is flat enough and when it is not. (Use a generator from Seven-dice from Five-dice).
See also:
AutoHotkey
<lang AutoHotkey>MsgBox, % DistCheck("dice7",10000,3)
DistCheck(function, repetitions, delta)
{ Loop, % 7 ; initialize array
{ bucket%A_Index% := 0 } Loop, % repetitions ; populate buckets { v := %function%() bucket%v% += 1 }
lbnd := round((repetitions/7)*(100-delta)/100) ubnd := round((repetitions/7)*(100+delta)/100) text := "Distribution check:`n`nTotal elements = " repetitions . "`n`nMargin = " delta "% --> Lbound = " lbnd ", Ubound = " ubnd "`n" Loop, % 7 { text := text "`nBucket " A_Index " contains " bucket%A_Index% " elements." If bucket%A_Index% not between %lbnd% and %ubnd% text := text " Skewed." } Return, text
}</lang>
Distribution check: Total elements = 10000 Margin = 3% --> Lbound = 1386, Ubound = 1471 Bucket 1 contains 1450 elements. Bucket 2 contains 1374 elements. Skewed. Bucket 3 contains 1412 elements. Bucket 4 contains 1465 elements. Bucket 5 contains 1370 elements. Skewed. Bucket 6 contains 1485 elements. Skewed. Bucket 7 contains 1444 elements.
C
<lang c>#include <stdio.h>
- include <stdlib.h>
- include <stdbool.h>
- include <math.h>
- include <Judy.h>
bool distcheck(int (*dist)(), int n, double D) {
Pvoid_t h = (Pvoid_t) NULL; PWord_t value; PWord_t element;
Word_t i; int h_length;
// populate hashes for(i=0; i < n; i++) { int rn = dist(); JLI(value, h, rn); ++*value; }
JLC(h_length, h, 0, -1);
double target = 1.0 * n / (double)h_length;
i = 0; JLF(element, h, i); while(element != NULL) { if ( abs(*element - target) > 0.01*n*D ) { fprintf(stderr, "distribution potentially skewed for '%d': expected '%d', got '%d'\n",
i, (int)target, *element);
JudyLFreeArray(&h, PJE0); return false; // bad distr. } JLN(element, h, i); }
JudyLFreeArray(&h, PJE0); return true; // distr. ok
}
int main() {
distcheck(rand, 1000000, 1); return 0;
}</lang>
C++
<lang cpp>#include <map>
- include <iostream>
- include <cmath>
template<typename F>
bool test_distribution(F f, int calls, double delta)
{
typedef std::map<int, int> distmap; distmap dist;
for (int i = 0; i < calls; ++i) ++dist[f()];
double mean = 1.0/dist.size();
bool good = true;
for (distmap::iterator i = dist.begin(); i != dist.end(); ++i) { if (std::abs((1.0 * i->second)/calls - mean) > delta) { std::cout << "Relative frequency " << i->second/(1.0*calls) << " of result " << i->first << " deviates by more than " << delta << " from the expected value " << mean << "\n"; good = false; } }
return good;
}</lang>
Common Lisp
<lang lisp>(defun check-distribution (function n &optional (delta 1.0))
(let ((bins (make-hash-table))) (loop repeat n do (incf (gethash (funcall function) bins 0))) (loop with target = (/ n (hash-table-count bins)) for key being each hash-key of bins using (hash-value value) when (> (abs (- value target)) (* 0.01 delta n)) do (format t "~&Distribution potentially skewed for ~w:~ expected around ~w got ~w." key target value) finally (return bins))))</lang>
> (check-distribution 'd7 1000) Distribution potentially skewed for 1: expected around 1000/7 got 153. Distribution potentially skewed for 2: expected around 1000/7 got 119. Distribution potentially skewed for 3: expected around 1000/7 got 125. Distribution potentially skewed for 7: expected around 1000/7 got 156. T #<EQL Hash Table{7} 200B5A53> > (check-distribution 'd7 10000) NIL #<EQL Hash Table{7} 200CB5BB>
D
<lang d>import std.math: abs; import std.string: format; import std.stdio: writefln;
/** Bin the answers to fn() and check bin counts are within +/- delta % of repeats/bincount.
- /
void distCheck(TF)(TF func, int nrepeats, double delta) {
int[int] freqs; for (int i; i < nrepeats; i++) freqs[func()]++; double target = nrepeats / cast(double)freqs.length; int deltaCount = cast(int)(delta / 100.0 * target);
foreach (k, count; freqs) if (abs(target - count) >= deltaCount) throw new Exception(format("distribution potentially skewed for '%s': '%d'\n", k, count));
writefln(freqs);
}</lang>
Haskell
<lang haskell>import System.Random import Data.List import Control.Monad import Control.Arrow
distribCheck :: IO Int -> Int -> Int -> IO [(Int,(Int,Bool))] distribCheck f n d = do
nrs <- replicateM n f let group = takeWhile (not.null) $ unfoldr (Just. (partition =<< (==). head)) nrs avg = (fromIntegral n) / fromIntegral (length group) ul = round $ (100 + fromIntegral d)/100 * avg ll = round $ (100 - fromIntegral d)/100 * avg return $ map (head &&& (id &&& liftM2 (&&) (>ll)(<ul)).length) group</lang>
Example: <lang haskell>*Main> mapM_ print .sort =<< distribCheck (randomRIO(1,6)) 100000 3 (1,(16911,True)) (2,(16599,True)) (3,(16670,True)) (4,(16624,True)) (5,(16526,True)) (6,(16670,True))</lang>
J
This solution defines the checker as an adverb. Adverbs combine with the verb immediately to their left to create a new verb. So for a verb generateDistribution
and an adverb checkUniform
, the new verb might be thought of as checkGeneratedDistributionisUniform
.
The delta is given as an optional left argument (x
), defaulting to 5%. The right argument (y
) is any valid argument to the distribution generating verb.
<lang j>checkUniform=: adverb define
0.05 u checkUniform y : n=. */y delta=. x sample=. u n NB. the "u" refers to the verb to left of adverb freqtable=. /:~ (~. sample) ,. #/.~ sample expected=. n % # freqtable errmsg=. 'Distribution is potentially skewed' errmsg assert (delta * expected) > | expected - {:"1 freqtable freqtable
)</lang>
It is possible to use tacit expressions within an explicit definition enabling a more functional and concise style: <lang j>checkUniformT=: adverb define
0.05 u checkUniformT y : freqtable=. /:~ (~. ,. #/.~) u n=. */y errmsg=. 'Distribution is potentially skewed' errmsg assert ((n % #) (x&*@[ > |@:-) {:"1) freqtable freqtable
)</lang>
Show usage using rollD7t
given in Seven-dice from Five-dice:
<lang j> 0.05 rollD7t checkUniform 1e5 1 14082 2 14337 3 14242 4 14470 5 14067 6 14428 7 14374
0.05 rollD7t checkUniform 1e2
|Distribution is potentially skewed: assert | errmsg assert(delta*expected)>|expected-{:"1 freqtable</lang>
JavaScript
<lang javascript>function distcheck(random_func, times, opts) {
if (opts === undefined) opts = {} opts['delta'] = opts['delta'] || 2;
var count = {}, vals = []; for (var i = 0; i < times; i++) { var val = random_func(); if (! has_property(count, val)) { count[val] = 1; vals.push(val); } else count[val] ++; } vals.sort(function(a,b) {return a-b});
var target = times / vals.length; var tolerance = target * opts['delta'] / 100;
for (var i = 0; i < vals.length; i++) { var val = vals[i]; if (Math.abs(count[val] - target) > tolerance) throw "distribution potentially skewed for " + val + ": expected result around " + target + ", got " +count[val]; else print(val + "\t" + count[val]); }
}
function has_property(obj, propname) {
return typeof(obj[propname]) == "undefined" ? false : true;
}
try {
distcheck(function() {return Math.floor(10 * Math.random())}, 100000); print(); distcheck(function() {return (Math.random() > 0.95 ? 1 : 0)}, 100000);
} catch (e) {
print(e);
}</lang>
output
0 9945 1 9862 2 9954 3 10104 4 9861 5 10140 6 10066 7 10001 8 10101 9 9966 distribution potentially skewed for 0: expected result around 50000, got 95040
OCaml
<lang ocaml>let distcheck fn n ?(delta=1.0) () =
let h = Hashtbl.create 5 in for i = 1 to n do let v = fn() in let n = try Hashtbl.find h v with Not_found -> 0 in Hashtbl.replace h v (n+1) done; Hashtbl.iter (fun v n -> Printf.printf "%d => %d\n%!" v n) h; let target = (float n) /. float (Hashtbl.length h) in Hashtbl.iter (fun key value -> if abs_float(float value -. target) > 0.01 *. delta *. (float n) then (Printf.eprintf "distribution potentially skewed for '%d': expected around %f, got %d\n%!" key target value) ) h;
- </lang>
PicoLisp
The following function takes a count, and allowed deviation in per mill (one-tenth of a percent), and a 'prg' code body (i.e. an arbitrary number of executable expressions). <lang PicoLisp>(de checkDistribution (Cnt Pm . Prg)
(let Res NIL (do Cnt (accu 'Res (run Prg 1) 1)) (let (N (/ Cnt (length Res)) Min (*/ N (- 1000 Pm) 1000) Max (*/ N (+ 1000 Pm) 1000) ) (for R Res (prinl (cdr R) " " (if (>= Max (cdr R) Min) "Good" "Bad")) ) ) ) )</lang>
Output:
: (checkDistribution 100000 5 (rand 1 7)) 14299 Good 14394 Bad 14147 Bad 14418 Bad 14159 Bad 14271 Good 14312 Good
PureBasic
<lang PureBasic>Prototype RandNum_prt()
Procedure.s distcheck(*function.RandNum_prt, repetitions, delta.d)
Protected text.s, maxIndex = 0 Dim bucket(maxIndex) ;array will be resized as needed For i = 1 To repetitions ;populate buckets v = *function() If v > maxIndex maxIndex = v Redim bucket(maxIndex) EndIf bucket(v) + 1 Next lbnd = Round((repetitions / maxIndex) * (100 - delta) / 100, #PB_Round_Up) ubnd = Round((repetitions / maxIndex) * (100 + delta) / 100, #PB_Round_Down) text = "Distribution check:" + #crlf$ + #crlf$ text + "Total elements = " + Str(repetitions) + #crlf$ + #crlf$ text + "Margin = " + StrF(delta, 2) + "% --> Lbound = " + Str(lbnd) + ", Ubound = " + Str(ubnd) + #crlf$ For i = 1 To maxIndex If bucket(i) < lbnd Or bucket(i) > ubnd text + #crlf$ + "Bucket " + Str(i) + " contains " + Str(bucket(i)) + " elements. Skewed." EndIf Next ProcedureReturn text
EndProcedure
MessageRequester("Results", distcheck(@dice7(), 1000000, 0.20))</lang> A small delta was chosen to increase the chance of a skewed result in the sample output:
Distribution check: Total elements = 1000000 Margin = 0.20% --> Lbound = 142572, Ubound = 143142 Bucket 1 contains 141977 elements. Skewed. Bucket 6 contains 143860 elements. Skewed.
Python
<lang python>from collections import Counter from pprint import pprint as pp
def distcheck(fn, repeats, delta):
\ Bin the answers to fn() and check bin counts are within +/- delta % of repeats/bincount bin = Counter(fn() for i in range(repeats)) target = repeats // len(bin) deltacount = int(delta / 100. * target) assert all( abs(target - count) < deltacount for count in bin.values() ), "Bin distribution skewed from %i +/- %i: %s" % ( target, deltacount, [ (key, target - count) for key, count in sorted(bin.items()) ] ) pp(dict(bin))</lang>
Sample output:
>>> distcheck(dice5, 1000000, 1) {1: 200244, 2: 199831, 3: 199548, 4: 199853, 5: 200524} >>> distcheck(dice5, 1000, 1) Traceback (most recent call last): File "<pyshell#30>", line 1, in <module> distcheck(dice5, 1000, 1) File "C://Paddys/rand7fromrand5.py", line 54, in distcheck for key, count in sorted(bin.items()) ] AssertionError: Bin distribution skewed from 200 +/- 2: [(1, 4), (2, -33), (3, 6), (4, 11), (5, 12)]
R
<lang r>distcheck <- function(fn, repetitions=1e4, delta=3) {
if(is.character(fn)) { fn <- get(fn) } if(!is.function(fn)) { stop("fn is not a function") } samp <- fn(n=repetitions) counts <- table(samp) expected <- repetitions/length(counts) lbound <- expected * (1 - 0.01*delta) ubound <- expected * (1 + 0.01*delta) status <- ifelse(counts < lbound, "under", ifelse(counts > ubound, "over", "okay")) data.frame(value=names(counts), counts=as.vector(counts), status=status)
} distcheck(dice7.vec)</lang>
Ruby
<lang ruby>def distcheck(n, delta=1)
unless block_given? raise ArgumentError, "pass a block to this method" end h = Hash.new(0) n.times {h[ yield ] += 1} target = 1.0 * n / h.length h.each do |key, value| if (value - target).abs > 0.01 * delta * n raise StandardError, "distribution potentially skewed for '#{key}': expected around #{target}, got #{value}" end end h.keys.sort.each {|k| print "#{k} #{h[k]} "} puts
end
if __FILE__ == $0
begin distcheck(100_000) {rand(10)} distcheck(100_000) {rand > 0.95} rescue StandardError => e p e end
end</lang>
output:
0 9986 1 9826 2 9861 3 10034 4 9876 5 10114 6 10329 7 9924 8 10123 9 9927 #<StandardError: distribution potentially skewed for 'false': expected around 50000.0, got 94841>
Tcl
<lang tcl>proc distcheck {random times {delta 1}} {
for {set i 0} {$i<$times} {incr i} {incr vals([uplevel 1 $random])} set target [expr {$times / [array size vals]}] foreach {k v} [array get vals] { if {abs($v - $target) > $times * $delta / 100.0} { error "distribution potentially skewed for $k: expected around $target, got $v" } } foreach k [lsort -integer [array names vals]] {lappend result $k $vals($k)} return $result
}</lang> Demonstration: <lang tcl># First, a uniformly distributed random variable puts [distcheck {expr {int(10*rand())}} 100000]
- Now, one that definitely isn't!
puts [distcheck {expr {rand()>0.95}} 100000]</lang> Which produces this output (error in red):
0 10003 1 9851 2 10058 3 10193 4 10126 5 10002 6 9852 7 9964 8 9957 9 9994
distribution potentially skewed for 0: expected around 50000, got 94873