CalcNaturalBreaksCode

This page contains C++ code of a \( O(k \times n \times log(n) ) \) method for the classification of an array of numeric values such that the variance per class is minimal, known as a Fisher's Natural Breaks Classification. This algorithm is an improvement of Jenks' Natural Breaks Optimization method, which is an \( O(k \times n^2) \) algorithm.

This code is part of the [GeoDms] and derived from clc/dll/src1/CalcClassBreaks.cpp in our Subversion Repository. The following independent compilable and runnable program that finds the class-breaks for an array of random numbers and made that available here.

This code is written by Maarten Hilferink, &copy; Object Vision BV, and is provided under GNU GPL v3.0 license

=code=
 * 1) include 
 * 2) include
 * 3) include

typedef std::size_t                 SizeT; typedef SizeT                       CountType; typedef std::pair ValueCountPair; typedef std::vector         LimitsContainer; typedef std::vector ValueCountPairContainer;

// helper funcs template  T Min(T a, T b) { return (a<b) ? a : b; }

SizeT GetTotalCount(const ValueCountPairContainer& vcpc) {	SizeT sum = 0; ValueCountPairContainer::const_iterator i = vcpc.begin, e = vcpc.end; for(sum = 0; i!=e; ++i) sum += (*i).second; return sum; }

// helper struct JenksFisher

struct JenksFisher // captures the intermediate data and methods for the calculation of Natural Class Breaks. {	JenksFisher(const ValueCountPairContainer& vcpc, SizeT k)		:	m_M(vcpc.size) ,	m_K(k) ,	m_BufSize(vcpc.size-(k-1)) ,	m_PrevSSM(m_BufSize) ,	m_CurrSSM(m_BufSize) ,	m_CB(m_BufSize * (m_K-1)) ,	m_CBPtr {		m_CumulValues.reserve(vcpc.size); double cwv=0; CountType cw = 0, w;

for(SizeT i=0; i!=m_M; ++i) {			assert(!i || vcpc[i].first > vcpc[i-1].first); // PRECONDITION: the value sequence must be strictly increasing

w  = vcpc[i].second; assert(w > 0); // PRECONDITION: all weights must be positive

cw += w; assert(cw > w || !i); // No overflow? No loss of precision?

cwv += w * vcpc[i].first; m_CumulValues.push_back(ValueCountPair(cwv, cw)); if (i < m_BufSize) m_PrevSSM[i] = cwv * cwv / cw; // prepare SSM for first class. Last (k-1) values are omitted }	}

double GetW(SizeT b, SizeT e) // Gets sum of weighs for elements b..e.	{ assert(b);   // First element always belongs to class 0, thus queries should never include it. assert(b<=e); assert(e<m_M);

double res = m_CumulValues[e].second; res -= m_CumulValues[b-1].second; return res; }

double GetWV(SizeT b, SizeT e)	// Gets sum of weighed values for elements b..e	{ assert(b); assert(b<=e); assert(e<m_M);

double res = m_CumulValues[e].first; res -= m_CumulValues[b-1].first; return res; }

double GetSSM(SizeT b, SizeT e) // Gets the Squared Mean for elements b..e, multiplied by weight. // Note that n*mean^2 = sum^2/n when mean := sum/n {		double res = GetWV(b,e); return res * res / GetW(b,e); }

SizeT FindMaxBreakIndex(SizeT i, SizeT bp, SizeT ep) // finds CB[i+m_NrCompletedRows] given that // the result is at least bp+(m_NrCompletedRows-1) // and less than         ep+(m_NrCompletedRows-1) // Complexity: O(ep-bp) <= O(m) {		assert(bp < ep); assert(bp <= i); assert(ep <= i+1); assert(i <  m_BufSize); assert(ep <= m_BufSize);

double minSSM = m_PrevSSM[bp] + GetSSM(bp+m_NrCompletedRows, i+m_NrCompletedRows); SizeT foundP = bp; while (++bp < ep) {			double currSSM = m_PrevSSM[bp] + GetSSM(bp+m_NrCompletedRows, i+m_NrCompletedRows); if (currSSM > minSSM) {				minSSM = currSSM; foundP = bp; }		}		m_CurrSSM[i] = minSSM; return foundP; }

void CalcRange(SizeT bi, SizeT ei, SizeT bp, SizeT ep) // find CB[i+m_NrCompletedRows] // for all i>=bi and i(ep, mi+1));

assert(bp <= mp); assert(mp < ep); assert(mp <= mi); // solve first half of the sub-problems with lower 'half' of possible outcomes CalcRange(bi, mi, bp, Min(mi, mp+1));

m_CBPtr[ mi ] = mp; // store result for the middle element.

// solve second half of the sub-problems with upper 'half' of possible outcomes CalcRange(mi+1, ei, mp, ep); }

void CalcAll // complexity: O(m*log(m)*k) {		if (m_K>=2) {			m_CBPtr = m_CB.begin; for (m_NrCompletedRows=1; m_NrCompletedRows              m_CB; std::vector::iterator    m_CBPtr;

SizeT                 m_NrCompletedRows; };

// GetValueCountPairs // // GetValueCountPairs sorts chunks of values and then merges them in order to minimize extra memory and work when many values are equal. // This is done recursively while retaining used intermediary buffers in order to minimize heap allocations.

const SizeT BUFFER_SIZE = 1024;

void GetCountsDirect(ValueCountPairContainer& vcpc, const double* values, SizeT size) {	assert(size <= BUFFER_SIZE); assert(size > 0); assert(vcpc.empty);

double buffer[BUFFER_SIZE];

std::copy(values, values+size, buffer); std::sort(buffer, buffer+size);

double currValue = buffer[0]; SizeT    currCount = 1; for (SizeT index = 1; index != size; ++index) {		if (currValue < buffer[index]) {			vcpc.push_back(ValueCountPair(currValue, currCount)); currValue = buffer[index]; currCount = 1; }		else ++currCount; }	vcpc.push_back(ValueCountPair(currValue, currCount)); }

struct CompareFirst {	bool operator (const ValueCountPair& lhs, const ValueCountPair& rhs) {		return lhs.first < rhs.first; } };

void MergeToLeft(ValueCountPairContainer& vcpcLeft, const ValueCountPairContainer& vcpcRight, ValueCountPairContainer& vcpcDummy) {	assert(vcpcDummy.empty); vcpcDummy.swap(vcpcLeft); vcpcLeft.resize(vcpcRight.size + vcpcDummy.size);

std::merge(vcpcRight.begin, vcpcRight.end, vcpcDummy.begin, vcpcDummy.end, vcpcLeft.begin, CompareFirst);

ValueCountPairContainer::iterator currPair = vcpcLeft.begin, lastPair = vcpcLeft.end;

ValueCountPairContainer::iterator index = currPair+1; while (index != lastPair && currPair->first < index->first) {		currPair = index; ++index; }

double currValue = currPair->first; SizeT    currCount = currPair->second; for (index != lastPair;++index) {		if (currValue < index->first) {			*currPair++ = ValueCountPair(currValue, currCount); currValue = index->first; currCount = index->second; }		else currCount += index->second; }	*currPair++ = ValueCountPair(currValue, currCount); vcpcLeft.erase(currPair, lastPair);

vcpcDummy.clear; }

struct ValueCountPairContainerArray : std::vector {	void resize(SizeT k)	{ assert(capacity >= k); while (size < k)		{ push_back(ValueCountPairContainer); back.reserve(BUFFER_SIZE); }	}

void GetValueCountPairs(ValueCountPairContainer& vcpc, const double* values, SizeT size, unsigned int nrUsedContainers) {		assert(vcpc.empty); if (size <= BUFFER_SIZE) GetCountsDirect(vcpc, values, size); else {			resize(nrUsedContainers+2);

unsigned int m = size/2;

GetValueCountPairs(vcpc, values, m, nrUsedContainers); GetValueCountPairs(begin[nrUsedContainers], values + m, size - m, nrUsedContainers+1);

MergeToLeft(vcpc, begin[nrUsedContainers], begin[nrUsedContainers+1]); begin[nrUsedContainers].clear; }		assert(GetTotalCount(vcpc) == size); } };

void GetValueCountPairs(ValueCountPairContainer& vcpc, const double* values, SizeT n) { vcpc.clear;

if(n) {		ValueCountPairContainerArray vcpca; // max nr halving is log2(max cardinality / BUFFER_SIZE); max cardinality is SizeT(-1) vcpca.reserve(3+8*sizeof(SizeT)-10); vcpca.GetValueCountPairs(vcpc, values, n, 0);

assert(vcpc.size); } }

void ClassifyJenksFisherFromValueCountPairs(LimitsContainer& breaksArray, SizeT k, const ValueCountPairContainer& vcpc) {	breaksArray.resize(k); SizeT m = vcpc.size;

assert(k <= m); // PRECONDITION

if (!k) return;

JenksFisher jf(vcpc, k);

if (k > 1) {		jf.CalcAll;

SizeT lastClassBreakIndex = jf.FindMaxBreakIndex(jf.m_BufSize-1, 0, jf.m_BufSize);

while (--k) {			breaksArray[k] = vcpc[lastClassBreakIndex+k].first; assert(lastClassBreakIndex < jf.m_BufSize); if (k > 1) {				jf.m_CBPtr -= jf.m_BufSize; lastClassBreakIndex = jf.m_CBPtr[lastClassBreakIndex]; }		}		assert(jf.m_CBPtr == jf.m_CB.begin); }	assert( k == 0 ); breaksArray[0] = vcpc[0].first; // break for the first class is the minimum of the dataset. }

// test code


 * 1) include "boost/random.hpp"

int main(int c, char** argv) { 	const double rangeMin = 0.0; const double rangeMax = 10.0; typedef boost::uniform_real NumberDistribution; typedef boost::mt19937 RandomNumberGenerator; typedef boost::variate_generator Generator;

NumberDistribution distribution(rangeMin, rangeMax); RandomNumberGenerator generator; generator.seed(0); // seed with the current time Generator numberGenerator(generator, distribution); const int n = 1000000; const int k = 10;

std::cout << "Generating random numbers..." << std::endl;

std::vector values; values.reserve(n); for (int i=0; i!=n; ++i) {		double v = numberGenerator; values.push_back(v*v); //populate a distribuiton slightly more interesting than uniform, with a lower density at higher values. }	assert(values.size == n);

std::cout << "Generating sortedUniqueValueCounts ..." << std::endl; ValueCountPairContainer sortedUniqueValueCounts; GetValueCountPairs(sortedUniqueValueCounts, &values[0], n);

std::cout << "Finding Jenks ClassBreaks..." << std::endl; LimitsContainer resultingbreaksArray; ClassifyJenksFisherFromValueCountPairs(resultingbreaksArray, k, sortedUniqueValueCounts);

std::cout << "Reporting results..." << std::endl; for (double breakValue: resultingbreaksArray) std::cout << breakValue << std::endl << std::endl; std::cout << "Press a char and to terminate" << std::endl; char ch; std::cin >> ch; // wait for user to enter a key } // main