-- | This module computes the histogram by the
-- specified data and strategy applied for such computing.
--
-- The code in this module is essentially based on the
-- package
-- by Mike Izbicki, who kindly agreed to re-license
-- his library under BSD3, which allowed me to use his
-- code and comments with some modifications.
module Simulation.Aivika.Experiment.Histogram
(-- * Creating Histograms
Histogram(..),
histogram,
histogramBinSize,
histogramNumBins,
-- * Binning Strategies
BinningStrategy(..),
binSturges,
binDoane,
binSqrt,
binScott) where
import Data.List
import Data.Monoid
import qualified Data.Map as Map
import Numeric
-------------------------------------------------------------------------------
-- | Holds all the information needed to plot the histogram
-- for a list of different series. Each series produces its
-- own item in the resuling @[Int]@ list that may contain
-- zeros.
type Histogram = [(Double, [Int])]
-------------------------------------------------------------------------------
-- Bin counters; check out http://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
-- | The strategy applied to calculate the histogram bins.
type BinningStrategy = [Double] -> Int
-- | This is only a helper function to convert strategies that
-- specify bin width into strategies that specify the number of bins.
stratFromBinWidth :: [Double] -> Double -> Int
stratFromBinWidth xs h = ceiling $ (maximum xs - minimum xs) / h
-- | Sturges' binning strategy is the least computational work,
-- but recommended for only normal data.
binSturges :: BinningStrategy
binSturges xs = ceiling $ logBase 2 n + 1
where n = fromIntegral $ length xs
-- | Doane's binning strategy extends Sturges' for non-normal data.
-- It takes a little more time because it must calculate the kurtosis
-- (peakkiness) of the distribution.
binDoane :: BinningStrategy
binDoane xs = ceiling $ 1 + log n + log (1 + a * ((n/6)**(1/2)))
where
n = fromIntegral $ length xs
a = kurtosis xs
-- | Using the sqrt of the number of samples is not supported by any
-- theory, but is commonly used by excel and other histogram making software.
binSqrt :: BinningStrategy
binSqrt xs = round $ sqrt n
where
n = fromIntegral $ length xs
-- | Scott's rule is the optimal solution for normal data, but requires
-- more computation than Sturges'.
binScott :: BinningStrategy
binScott xs = stratFromBinWidth xs $ 3.5 * stddev xs / (n**(1/3))
where
n = fromIntegral $ length xs
-------------------------------------------------------------------------------
-- create the histogram data
-- | Creates a histogram by specifying the list of series. Call it with one of
-- the binning strategies that is appropriate to the type of data you have.
-- If you don't know, then try using 'binSturges'.
histogram :: BinningStrategy -> [[Double]] -> Histogram
histogram strat xs = histogramNumBins (strat $ concat xs) xs
-- | Create a histogram by specifying the exact bin size.
-- You probably don't want to use this function, and should use histogram
-- with an appropriate binning strategy.
histogramBinSize :: Double -> [[Double]] -> Histogram
histogramBinSize size = combineBins . map (histBins size . bin size)
-- | Create a histogram by the specified approximated number of bins.
-- You probably don't want to use this function, and should use
-- histogram with an appropriate binning strategy.
histogramNumBins :: Int -> [[Double]] -> Histogram
histogramNumBins n xs =
histogramBinSize size xs
where
size = fromIntegral (firstdigit diff) * (10 ** fromIntegral (exponent10 diff))
diff = if diff_test > 0
then diff_test
else 1
diff_test = (maximum (map maximum xs) -
minimum (map minimum xs)) / fromIntegral (max 1 n)
firstdigit dbl = floor $ dbl / (10 ** fromIntegral (exponent10 dbl))
exponent10 dbl = floor $ logBase 10 dbl
-- | It does all the binning for the histogram.
histBins :: Double -> [Double] -> [(Double, Int)]
histBins size xs = [ (head l, length l) | l <- group (sort xs) ]
-- | It "rounds" every number into the closest number below it
-- that is divisible by size.
bin :: Double -> [Double] -> [Double]
bin size xs @ [_] = xs
bin size xs = map (\x -> size * fromIntegral (floor (x / size)) + size / 2) xs
-- | Combine bins from different histograms (optimized version).
combineBins :: [[(Double, Int)]] -> [(Double, [Int])]
combineBins [xs] = map (\(t, n) -> (t, [n])) xs
combineBins xss = combineBins' xss
-- | Combine bins from different histograms (generic version).
combineBins' :: [[(Double, Int)]] -> [(Double, [Int])]
combineBins' xs = map (merging . sorting) $
groupping $ ordering $ concat $ numbering xs
where numbering = zipWith (curry indexing) [1..]
indexing (i, x) = map (\(t, n) -> (i, t, n)) x
ordering = sortBy $ \(_, t1, _) (_, t2, _) -> compare t1 t2
groupping = groupBy $ \(_, t1, _) (_, t2, _) -> t1 == t2
sorting = sortBy $ \(i1, _, _) (i2, _, _) -> compare i1 i2
merging zs @ ((_, t, _) : _) = merging' zs t 1 []
merging' [] t k acc
| k <= count = merging' [] t (k + 1) (0 : acc)
| k > count = (t, reverse acc)
merging' zs @ ((i, _, n) : xs) t k acc
| i == k = merging' xs t (k + 1) (n : acc)
| i > k = merging' zs t (k + 1) (0 : acc)
count = length xs
------------------------------------------------------------------------------
-- simple math functions
-- taken from package hstats, which wouldn't fully compile
-- | Numerically stable mean.
mean :: Floating a => [a] -> a
mean x = fst $ foldl' (\(m, n) x -> (m+(x-m)/(n+1),n+1)) (0,0) x
-- | Standard deviation of sample.
stddev :: (Floating a) => [a] -> a
stddev xs = sqrt $ var xs
-- | Sample variance.
var :: (Floating a) => [a] -> a
var xs = var' 0 0 0 xs / fromIntegral (length xs - 1)
where
var' _ _ s [] = s
var' m n s (x:xs) = var' nm (n + 1) (s + delta * (x - nm)) xs
where
delta = x - m
nm = m + delta / fromIntegral (n + 1)
-- | kurtosis is taken from wikipedia's definition.
kurtosis :: (Floating a) => [a] -> a
kurtosis xs = ((1/n) * sum [(x-x_bar)^4 | x <- xs])
/ ((1/n) * sum [(x-x_bar)^2 | x <- xs])^2 -3
where
n = fromIntegral $ length xs
x_bar = mean xs