// --*- C++ -*------x---------------------------------------------------------
// $Id: CompensationScorer.cc,v 1.1.1.1 2006/07/03 14:43:19 bindewae Exp $
//
// Class:           CompensationScorer
// 
// Base class:      -
//
// Derived classes: - 
//
// Author:          Eckart Bindewald
//
// Project name:    -
//
// Date:            $Date: 2006/07/03 14:43:19 $
//
// Description:     - 
// -----------------x-------------------x-------------------x-----------------

#include <CompensationScorer.h>
#include <generalNumerics.h>
#include <Random.h>
#include <vectornumerics.h>
#include <Limits.h> 
#include <stdio.h>

// Minimum number of non-gap characters a column must contain; the
// singleEntropy* functions in this file return 0.0 for columns with fewer.
const unsigned int MIN_ALIGNED = 2;

// ---------------------------------------------------------------------------
//                                   CompensationScorer
// -----------------x-------------------x-------------------x-----------------

/* CONSTRUCTORS */

/** Default constructor.

    Initializes every tunable with its built-in default, registers the six
    allowed pairs (GC, CG, AU, UA, GU, UG), zeroes the 256 x 256
    pair-frequency lookup table and assigns the uniform frequency
    1 / alphabet.size() to every letter of the alphabet.
*/
CompensationScorer::CompensationScorer()
  : algorithm(14),
    verboseLevel(1),
    compMin(0.75),
    energyWeight(0.0),
    entropyStdMax(0.01),
    entropyWeight(1.0),
    gapFracMin(0.0),
    threshold(0.5),
    userMean(0.0),
    userDev(1.0),
    alphabet(RNA_ALPHABET),
    compatibilityExponent(1.0),
    randomSampleNum(100),
    randomSampleSquareNum(500)
{
  // pairs that count as compatible, listed in both orientations
  static const char* const DEFAULT_PAIRS[] = {
    "GC", "CG", "AU", "UA", "GU", "UG"
  };
  const unsigned int numDefaultPairs =
    sizeof(DEFAULT_PAIRS) / sizeof(DEFAULT_PAIRS[0]);
  allowedPairs = Vec<string>(numDefaultPairs);
  for (unsigned int k = 0; k < numDefaultPairs; ++k) {
    allowedPairs[k] = DEFAULT_PAIRS[k];
  }

  // lookup table starts out empty
  freqLookUpGaps = 0;
  freqLookUpNorm = 0.0;
  const Vec<double> zeroRow(256, 0.0);
  freqLookUp = Vec<Vec<double> >(256, zeroRow);

  // uniform background frequencies
  const double uniformFrequency = 1.0 / static_cast<double>(alphabet.size());
  baseFrequencies = Vec<double>(alphabet.size(), uniformFrequency);
}

/** Copy constructor: delegates all member copying to copy(). */
CompensationScorer::CompensationScorer(const CompensationScorer& other)
{
  this->copy(other);
}

/** Destructor: nothing to release explicitly. */
CompensationScorer::~CompensationScorer()
{
}


/* OPERATORS */

/** Assignment operator.

    Self-assignment is a no-op; otherwise the state of orig is taken over
    via copy().

    @return reference to this object
*/
CompensationScorer& 
CompensationScorer::operator = (const CompensationScorer& orig)
{
  if (this == &orig) {
    return *this;
  }
  copy(orig);
  return *this;
}

/** Writes alphabet, allowed pairs and compatibility exponent to the
    stream, separated by single blanks. Other members are not written. */
ostream& 
operator << (ostream& os, const CompensationScorer& rval)
{
  os << rval.alphabet;
  os << " ";
  os << rval.allowedPairs;
  os << " ";
  os << rval.compatibilityExponent;
  return os;
}

/** Reads alphabet, allowed pairs and compatibility exponent from the
    stream, in the order in which operator << writes them. */
istream& 
operator >> (istream& is, CompensationScorer& rval)
{
  is >> rval.alphabet;
  is >> rval.allowedPairs;
  is >> rval.compatibilityExponent;
  return is;
}

/* PREDICATES */

/** Is current state valid?

    The only condition checked is that the alphabet is not empty.

    @author Eckart Bindewald
    @return true <=> alphabet contains at least one letter
*/
bool
CompensationScorer::isValid() const
{
  return alphabet.size() != 0;
}


/** returns true, if c1 and c2 are found in allowedPairs */
bool 
CompensationScorer::isAllowedPair(char c1, char c2) const
{
  if ((c1 == GAP_CHAR) || (c2 == GAP_CHAR)) {
    return false;
  }
  for (unsigned int i = 0; i < allowedPairs.size(); ++i) {
    ASSERT(allowedPairs[i].size() == 2);
    if (((allowedPairs[i][0] == c1) && (allowedPairs[i][1] == c2))
	|| ((allowedPairs[i][0] == c2) && (allowedPairs[i][1] == c1))) {
      return true;
    }
  }
  return false;
}

/** returns true, if c1 and c2 are found in allowedPairs */
unsigned int 
CompensationScorer::numAllowedPair(char c1, char c2) const
{
  for (unsigned int i = 0; i < allowedPairs.size(); ++i) {
    ASSERT(allowedPairs[i].size() == 2);
    if (((allowedPairs[i][0] == c1) && (allowedPairs[i][1] == c2))
	|| ((allowedPairs[i][0] == c2) && (allowedPairs[i][1] == c1))) {
      return i;
    }
  }
  return allowedPairs.size();
}

/** Counts the positions at which s1 and s2 hold an allowed pair.

    Both strings must have the same length.

    @return number of positions i with isAllowedPair(s1[i], s2[i])
*/
unsigned int
CompensationScorer::countAllowedPairs(const string& s1, const string& s2) const
{
  PRECOND(s1.size() == s2.size());
  unsigned int numMatching = 0;
  const unsigned int len = s1.size();
  for (unsigned int pos = 0; pos < len; ++pos) {
    numMatching += isAllowedPair(s1[pos], s2[pos]) ? 1U : 0U;
  }
  return numMatching;
}

/** Counts allowed pairs after renaming the letters of both sequences
    according to one random permutation of the alphabet.

    The alphabet is shuffled with the shared Random instance; every
    character of s1 and s2 that equals alphabet[j] is replaced by the
    shuffled letter at the same index j. Characters that are not part of
    the alphabet (for example gaps) are left untouched.

    Each character is translated exactly once: the search stops at the
    first matching alphabet letter. Without that stop, a freshly renamed
    character could match a later alphabet entry and be renamed again, so
    the applied mapping would no longer be the shuffled permutation.

    @param s1 first sequence (taken by value, modified locally)
    @param s2 second sequence, same length as s1
    @return number of positions holding an allowed pair after renaming
*/
unsigned int
CompensationScorer::countRandomRenamedAllowedPairs(string s1, 
						   string s2) const
{
  PRECOND(s1.size() == s2.size());
  Random& rnd = Random::getInstance();
  string randomAlphabet = alphabet;
  random_shuffle(randomAlphabet.begin(), randomAlphabet.end(), rnd);
  const unsigned int nAlpha = alphabet.size();
  // translate strings according to new randomized alphabet:
  for (unsigned int i = 0; i < s1.size(); ++i) {
    for (unsigned int j = 0; j < nAlpha; ++j) {
      if (s1[i] == alphabet[j]) {
	s1[i] = randomAlphabet[j];
	break; // translate once only, do not re-match the new letter
      }
    }
    for (unsigned int j = 0; j < nAlpha; ++j) {
      if (s2[i] == alphabet[j]) {
	s2[i] = randomAlphabet[j];
	break; // translate once only, do not re-match the new letter
      }
    }
  }
  unsigned int result = 0;
  for (unsigned int i = 0; i < s1.size(); ++i) {
    if (isAllowedPair(s1[i], s2[i])) {
      ++result;
    }
  }
  return result;
}

/** counts number of matching nucleotides,
    given average over a randomly renamed definition of allowed pairs! */
double
CompensationScorer::averageRandomRenamedAllowedPairs(const string& s1Orig, 
						     const string& s2Orig) const
{
  PRECOND(s1Orig.size() == s2Orig.size());
  string randomAlphabet = alphabet;
  unsigned int sum = 0;
  unsigned int counter = 0;
  unsigned int result = 0;
  string s1 = s1Orig;
  string s2 = s2Orig;
  unsigned int limitCount = countAllowedPairs(s1Orig, s2Orig);
  while (next_permutation(randomAlphabet.begin(), randomAlphabet.end())) {
    unsigned int nAlpha = alphabet.size();
    // translate string according to new randomized alphabet:
    for (unsigned int i = 0; i < s1.size(); ++i) {
      for (unsigned int j = 0; j < nAlpha; ++j) {
	if (s1Orig[i] == alphabet[j]) {
	  s1[i] = randomAlphabet[j];
	}
      }
      for (unsigned int j = 0; j < nAlpha; ++j) {
	if (s2Orig[i] == alphabet[j]) {
	  s2[i] = randomAlphabet[j];
	}
      }
    }
    result = countAllowedPairs(s1, s2);
    if (result >= limitCount) {
      ++sum;
    }
    ++counter;
  }
  return static_cast<double>(sum)/static_cast<double>(counter);
}

/** Overwrites every character of result with a letter drawn from alph.

    The length of result is not changed. The letter index is
    rnd.getRand() modulo the alphabet size. An empty alphabet leaves
    result untouched; this avoids a modulo-by-zero.

    @param result sequence to fill in place
    @param alph   letters to draw from
    @param rnd    random number source
*/
void
CompensationScorer::generateRandomSequence(string& result,
					   const string& alph,
					   Random& rnd)
{
  if (alph.empty()) {
    return; // nothing to draw from
  }
  for (unsigned int i = 0; i < result.size(); ++i) {
    result[i] = alph[rnd.getRand() % alph.size()];
  }
}

/** Counts the positions at which at least one of the two sequences
    holds the gap character.

    @param gapChar character regarded as a gap
    @return number of positions with a gap in s1, in s2 or in both
*/
unsigned int
CompensationScorer::countGaps(const string& s1, const string& s2, char gapChar) const
{
  PRECOND(s1.size() == s2.size());
  unsigned int numGapped = 0;
  const unsigned int len = s1.size();
  for (unsigned int pos = 0; pos < len; ++pos) {
    const bool gapped = (s1[pos] == gapChar) || (s2[pos] == gapChar);
    if (gapped) {
      ++numGapped;
    }
  }
  return numGapped;
}

/** Relative frequency of character c in a column.

    All rows count towards the denominator, gaps included.

    @return (number of rows equal to c) / (number of rows),
            0.0 for an empty column
*/
double
CompensationScorer::frequency(const string& col, char c) {
  const unsigned int numRows = col.size();
  if (numRows == 0) {
    return 0.0;
  }
  unsigned int hits = 0;
  for (unsigned int row = 0; row < numRows; ++row) {
    if (col[row] == c) {
      ++hits;
    }
  }
  return static_cast<double>(hits) / numRows;
}

/** Weighted relative frequency of character c in a column.

    Row i contributes wVec[i]; all rows count towards the normalization,
    gaps included. wVec is indexed for every row of col, so it is
    expected to be at least as long as col.

    @return (weight of rows equal to c) / (weight of all rows),
            0.0 for an empty column or a non-positive weight sum
*/
double
CompensationScorer::frequency(const string& col, char c, const Vec<double>& wVec) {
  if (col.size() == 0) {
    return 0.0;
  }
  double hitWeight = 0;
  double totalWeight = 0.0;
  const unsigned int numRows = col.size();
  for (unsigned int row = 0; row < numRows; ++row) {
    const double w = wVec[row];
    totalWeight += w;
    if (col[row] == c) {
      hitWeight += w;
    }
  }
  if (totalWeight <= 0.0) {
    return 0.0;
  }
  return hitWeight / totalWeight;
}

/** Relative frequency of the character pair (c1, c2) in two columns.

    Only the rows present in both columns are inspected; gaps are not
    treated specially.

    @return (rows with col1 == c1 and col2 == c2) / (rows inspected),
            0.0 if either column is empty
*/
double
CompensationScorer::frequency(const string& col1, const string& col2, 
			      char c1, char c2)
{
  const unsigned int numRows =
    (col1.size() < col2.size()) ? col1.size() : col2.size();
  if (numRows == 0) {
    return 0.0;
  }
  unsigned int hits = 0;
  for (unsigned int row = 0; row < numRows; ++row) {
    const bool match = (col1[row] == c1) && (col2[row] == c2);
    if (match) {
      ++hits;
    }
  }
  return static_cast<double>(hits) / numRows;
}

/** Weighted relative frequency of the character pair (c1, c2) in two
    columns.

    Only the rows present in both columns are inspected; row i
    contributes wVec[i]. Gaps are not treated specially.

    @return (weight of rows with col1 == c1 and col2 == c2)
            / (weight of all inspected rows),
            0.0 if either column is empty or the weight sum is not positive
*/
double
CompensationScorer::frequency(const string& col1, const string& col2, char c1, char c2,
			      const Vec<double>& wVec)
{
  const unsigned int numRows =
    (col1.size() < col2.size()) ? col1.size() : col2.size();
  if (numRows == 0) {
    return 0.0;
  }
  double hitWeight = 0;
  double totalWeight = 0.0;
  for (unsigned int row = 0; row < numRows; ++row) {
    const double w = wVec[row];
    totalWeight += w;
    if ((col1[row] == c1) && (col2[row] == c2)) {
      hitWeight += w;
    }
  }
  if (totalWeight <= 0.0) {
    return 0.0;
  }
  return hitWeight / totalWeight;
}


/** Frequency of character c in a column, relative to the number of
    non-gap rows.

    A row holding the gap character is counted as a gap even if c itself
    is the gap character.

    @return (rows equal to c) / (rows that are not gaps),
            0.0 for an empty or all-gap column
*/
double
CompensationScorer::frequency2(const string& col, char c) {
  if (col.size() == 0) {
    return 0.0;
  }
  unsigned int hits = 0;
  unsigned int gaps = 0;
  const unsigned int numRows = col.size();
  for (unsigned int row = 0; row < numRows; ++row) {
    const char ch = col[row];
    if (ch == GAP_CHAR) {
      ++gaps;
      continue;
    }
    if (ch == c) {
      ++hits;
    }
  }
  if (gaps >= col.size()) {
    return 0.0;
  }
  return static_cast<double>(hits) / static_cast<double>(col.size() - gaps);
}

/** Weighted frequency of character c in a column, relative to the
    weight of the non-gap rows.

    Gap rows contribute neither to the numerator nor to the
    normalization; every other row i contributes wVec[i].

    @return (weight of rows equal to c) / (weight of non-gap rows),
            0.0 for an empty column or a non-positive weight sum
*/
double
CompensationScorer::frequency2(const string& col, char c, const Vec<double>& wVec) {
  if (col.size() == 0) {
    return 0.0;
  }
  double hitWeight = 0.0;
  double totalWeight = 0.0;
  const unsigned int numRows = col.size();
  for (unsigned int row = 0; row < numRows; ++row) {
    if (col[row] == GAP_CHAR) {
      continue; // gaps are ignored entirely
    }
    totalWeight += wVec[row];
    if (col[row] == c) {
      hitWeight += wVec[row];
    }
  }
  if (totalWeight <= 0.0) {
    return 0.0;
  }
  return hitWeight / totalWeight;
}

/** Frequency of character c in a column relative to the non-gap rows,
    with pseudocounts.

    @param pseudoTop    pseudocount added to the numerator
    @param pseudoBottom pseudocount added to the denominator
    @return (rows equal to c + pseudoTop)
            / (non-gap rows + pseudoBottom),
            0.0 for an empty or all-gap column
*/
double
CompensationScorer::frequency3(const string& col, char c,
			       unsigned int pseudoTop,
			       unsigned int pseudoBottom) {
  if (col.size() == 0) {
    return 0.0;
  }
  unsigned int hits = 0;
  unsigned int gaps = 0;
  const unsigned int numRows = col.size();
  for (unsigned int row = 0; row < numRows; ++row) {
    const char ch = col[row];
    if (ch == GAP_CHAR) {
      ++gaps;
      continue;
    }
    if (ch == c) {
      ++hits;
    }
  }
  if (gaps >= col.size()) {
    return 0.0;
  }
  const double numerator = static_cast<double>(hits + pseudoTop);
  const double denominator =
    static_cast<double>((col.size() + pseudoBottom) - gaps);
  return numerator / denominator;
}

/** Weighted frequency of character c in a column relative to the
    non-gap rows, with pseudocounts.

    Gap rows are ignored; every other row i contributes wVec[i].

    @param pseudoTop    pseudocount added to the numerator
    @param pseudoBottom pseudocount added to the denominator
    @return (weight of rows equal to c + pseudoTop)
            / (weight of non-gap rows + pseudoBottom),
            0.0 for an empty column or a non-positive denominator
*/
double
CompensationScorer::frequency3(const string& col, char c,
			       unsigned int pseudoTop,
			       unsigned int pseudoBottom,
			       const Vec<double>& wVec) {
  if (col.size() == 0) {
    return 0.0;
  }
  double hitWeight = 0.0;
  double totalWeight = 0.0;
  const unsigned int numRows = col.size();
  for (unsigned int row = 0; row < numRows; ++row) {
    if (col[row] == GAP_CHAR) {
      continue;
    }
    totalWeight += wVec[row];
    if (col[row] == c) {
      hitWeight += wVec[row];
    }
  }
  if ((totalWeight + pseudoBottom) <= 0.0) {
    return 0.0;
  }
  return ( hitWeight+pseudoTop ) / (totalWeight + pseudoBottom);
}


/** Frequency of the character pair (c1, c2) in two columns, relative to
    the number of rows without a gap.

    Only rows present in both columns are inspected. A row counts as a
    gap row if either column holds the gap character there.

    @return (rows with col1 == c1 and col2 == c2) / (rows without gap),
            0.0 if either column is empty or every row has a gap
*/
double
CompensationScorer::frequency2(const string& col1, 
			       const string& col2, char c1, char c2 ) 
{
  const unsigned int numRows =
    (col1.size() < col2.size()) ? col1.size() : col2.size();
  if (numRows == 0) {
    return 0.0;
  }
  unsigned int hits = 0;
  unsigned int gapRows = 0;
  for (unsigned int row = 0; row < numRows; ++row) {
    const char a = col1[row];
    const char b = col2[row];
    if ((a == GAP_CHAR) || (b == GAP_CHAR)) {
      ++gapRows;
      continue;
    }
    if ((a == c1) && (b == c2)) {
      ++hits;
    }
  }
  if (gapRows >= numRows) {
    return 0.0;
  }
  return static_cast<double>(hits) / static_cast<double>(numRows - gapRows);
}

/** Weighted frequency of the character pair (c1, c2) in two columns,
    relative to the weight of the rows without a gap.

    Only rows present in both columns are inspected. Rows with a gap in
    either column are ignored; every other row i contributes wVec[i].

    @return (weight of rows with col1 == c1 and col2 == c2)
            / (weight of rows without gap),
            0.0 if either column is empty or the weight sum is not positive
*/
double
CompensationScorer::frequency2(const string& col1, 
			       const string& col2, char c1, char c2,
			       const Vec<double>& wVec) 
{
  const unsigned int numRows =
    (col1.size() < col2.size()) ? col1.size() : col2.size();
  if (numRows == 0) {
    return 0.0;
  }
  double hitWeight = 0.0;
  double totalWeight = 0.0;
  for (unsigned int row = 0; row < numRows; ++row) {
    if ((col1[row] == GAP_CHAR) || (col2[row] == GAP_CHAR)) {
      continue; // gapped rows carry no weight
    }
    totalWeight += wVec[row];
    if ((col1[row] == c1) && (col2[row] == c2)) {
      hitWeight += wVec[row];
    }
  }
  if (totalWeight <= 0.0) {
    return 0.0;
  }
  return hitWeight / totalWeight;
}

/** Frequency of the character pair (c1, c2) in two columns, relative to
    the number of rows without a gap, with pseudocounts.

    Only rows present in both columns are inspected. A row counts as a
    gap row if either column holds the gap character there.

    @param pseudoTop    pseudocount added to the numerator
    @param pseudoBottom pseudocount added to the denominator
    @return (matching rows + pseudoTop)
            / (rows without gap + pseudoBottom),
            0.0 if either column is empty or every row has a gap
*/
double
CompensationScorer::frequency3(const string& col1, 
			       const string& col2, char c1, char c2,
			       unsigned int pseudoTop,
			       unsigned int pseudoBottom)
{
  const unsigned int numRows =
    (col1.size() < col2.size()) ? col1.size() : col2.size();
  if (numRows == 0) {
    return 0.0;
  }
  unsigned int hits = 0;
  unsigned int gapRows = 0;
  for (unsigned int row = 0; row < numRows; ++row) {
    const char a = col1[row];
    const char b = col2[row];
    if ((a == GAP_CHAR) || (b == GAP_CHAR)) {
      ++gapRows;
      continue;
    }
    if ((a == c1) && (b == c2)) {
      ++hits;
    }
  }
  if (gapRows >= numRows) {
    return 0.0;
  }
  const double numerator = static_cast<double>(hits + pseudoTop);
  const double denominator =
    static_cast<double>((numRows + pseudoBottom) - gapRows);
  return numerator / denominator;
}

/** Weighted frequency of the character pair (c1, c2) in two columns,
    relative to the weight of the rows without a gap, with pseudocounts.

    Only rows present in both columns are inspected. Rows with a gap in
    either column are ignored; every other row i contributes wVec[i].

    @param pseudoTop    pseudocount added to the numerator
    @param pseudoBottom pseudocount added to the denominator
    @return (weight of matching rows + pseudoTop)
            / (weight of rows without gap + pseudoBottom),
            0.0 if either column is empty or the denominator is not positive
*/
double
CompensationScorer::frequency3(const string& col1, 
			       const string& col2, char c1, char c2,
			       unsigned int pseudoTop,
			       unsigned int pseudoBottom,
			       const Vec<double>& wVec)
{
  const unsigned int numRows =
    (col1.size() < col2.size()) ? col1.size() : col2.size();
  if (numRows == 0) {
    return 0.0;
  }
  double hitWeight = 0.0;
  double totalWeight = 0.0;
  for (unsigned int row = 0; row < numRows; ++row) {
    if ((col1[row] == GAP_CHAR) || (col2[row] == GAP_CHAR)) {
      continue; // gapped rows carry no weight
    }
    totalWeight += wVec[row];
    if ((col1[row] == c1) && (col2[row] == c2)) {
      hitWeight += wVec[row];
    }
  }
  if ((totalWeight + pseudoBottom) <= 0.0) {
    return 0.0;
  }
  return ( hitWeight+pseudoTop ) / (totalWeight + pseudoBottom);
}

/** Binomial-style score for the allowed pair types seen in two columns.

    With nn = number of distinct (letter, letter) combinations seen in
    the non-skipped rows (at least 1), k = number of allowedPairs entries
    seen at least once and p0 = allowedPairs.size() / alphabet.size()^2,
    the returned value is

      nOverK(nn, k) * p0^k * (1 - p0)^(nn - k)

    (nOverK is presumably the binomial coefficient -- defined elsewhere,
    TODO confirm). Rows for which skipPair() is true are ignored.
    If verboseLevel > 2 the score and both columns are printed to cout.

    @param col1 first column
    @param col2 second column, same non-zero length as col1
    @return value of the expression above
*/
double
CompensationScorer::compatibilityScore(const string& col1,
				       const string& col2) const {
  PRECOND(col1.size() == col2.size());
  PRECOND(col1.size() > 0);
  unsigned int numAllowed = 0;
  unsigned int skipCount = 0; // incremented below but not used in the result
  int pairId = 0;
  // cout << " hi" << endl;
  // pairCounters[p]: number of rows matching allowedPairs[p]
  Vec<int> pairCounters(allowedPairs.size(), 0);
  // totalPairCounters[a][b]: 1 if letter ids (a, b) occurred in some row
  Vec<Vec<int> > totalPairCounters(alphabet.size(), 
				   Vec<int>(alphabet.size(), 0U));
  for (unsigned int i = 0; i < col1.size(); ++i) {
    // ids are looked up before the skip test; they are only used as
    // indices for rows that are not skipped
    unsigned int id1 = letterId(col1[i]);
    unsigned int id2 = letterId(col2[i]);
    if (skipPair(col1[i], col2[i])) {
      ++skipCount;
      continue; // example: both have "-" character
    }
    else if ((pairId = numAllowedPair(col1[i], col2[i]))
	     < static_cast<int>(allowedPairs.size())) {
      // numAllowedPair returns allowedPairs.size() for "no match"
      ASSERT(pairId < static_cast<int>(pairCounters.size()));
      ++(pairCounters[pairId]);
      // ++numAllowed;
    }
    // flag, not count: repeated combinations are registered once
    totalPairCounters[id1][id2] = 1U;
  }
  int totalPairSum = 0;
  // counter total number of *different* pairs
  for (unsigned int i = 0; i < totalPairCounters.size(); ++i) {
    for (unsigned int j = 0; j < totalPairCounters[i].size(); ++j) {
      totalPairSum += totalPairCounters[i][j];
    }
  }
  int nnMax = alphabet.size() * alphabet.size(); // total number of possible pairs
  if (totalPairSum == 0) {
    totalPairSum = 1; // every row was skipped: use one trial
  }
  int nn = totalPairSum;
  
  // chance probability of one good pair:
  const double p0 = allowedPairs.size() / static_cast<double>(nnMax);
  // Binomial distribution:
  for (unsigned int i = 0; i < pairCounters.size(); ++i) {
    if (pairCounters[i] > 0) {
      ++numAllowed; // count each pair type only once
    }
  }    
  //     double relAllowed = static_cast<double>(numAllowed) 
  //       / (col1.size() - skipCount);
  //     double result = pow(relAllowed, compatibilityExponent);
  double result = nOverK(nn, numAllowed)
    * pow(p0, static_cast<double>(numAllowed)) * pow(1.0 - p0, 
						     static_cast<double>(nn - numAllowed));
  if (verboseLevel > 2) {
    cout << "Score of col1, col2: " << result << " " 
	 << col1 << " " << col2 << " " << endl;
    }
  return result;
}

/** returns 1 for highly reliable prediction,
    0 for totally uncompatible prediction.
    Returns fraction of pairs that are compatible
*/
double
CompensationScorer::compatibilityScore2(const string& col1,
					const string& col2) const {
  PRECOND(col1.size() == col2.size());
  PRECOND(col1.size() > 0);
  if (col1.size() == 0) {
    return 0.0;
  }
  int numCorrect = 0;
  int numSkip = 0;
  for (unsigned int i = 0; i < col1.size(); ++i) {
    if ((col1[i] == GAP_CHAR) || (col2[i] == GAP_CHAR)) {
      ++numSkip;
      continue;
    }
    else if (isAllowedPair(col1[i], col2[i])) {
      ++numCorrect;
    }
  }
  if (numSkip >= static_cast<int>(col1.size())) {
    return 0.0;
  }
  return static_cast<double>(numCorrect)/(col1.size()-numSkip);
}

/** Compatibility score weighted by a binomial term.

    With n = rows without gap, k = rows holding an allowed pair and
    p0 = allowedPairs.size() / alphabet.size()^2 the result is

      (1 - nOverK(n, k) * p0^k * (1 - p0)^(n - k)) * (k / n)

    The factor k / n biases the score towards compatible columns.
    If verboseLevel > 2 the score and both columns are printed to cout.

    @return score expected to lie in [0, 1] (checked by POSTCOND);
            0.0 if the columns are empty or every row has a gap
*/
double
CompensationScorer::compatibilityScore3(const string& col1,
					const string& col2) const {
  PRECOND(col1.size() == col2.size());
  PRECOND(col1.size() > 0);
  if (col1.size() == 0) {
    return 0.0;
  }
  int numCorrect = 0;
  int numSkip = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    const bool gapped = (col1[row] == GAP_CHAR) || (col2[row] == GAP_CHAR);
    if (gapped) {
      ++numSkip;
    }
    else if (isAllowedPair(col1[row], col2[row])) {
      ++numCorrect;
    }
  }
  if (numSkip >= static_cast<int>(col1.size())) {
    return 0.0;
  }
  // rows that actually take part in the statistic
  unsigned int numPairs = col1.size() - numSkip;
  if (numPairs == 0) {
    return 0.0;
  }
  // chance probability that a random pair is an allowed one
  unsigned int numPossible = alphabet.size() * alphabet.size();
  unsigned int numAllowed = allowedPairs.size();
  double p0 = static_cast<double>(numAllowed) / numPossible;
  // observed fraction of compatible rows
  double frac = static_cast<double>(numCorrect) / numPairs;
  // 1 corresponds to most confidence
  double result = 1.0 - (nOverK(numPairs, numCorrect)
    * pow(p0, static_cast<double>(numCorrect)) 
    * pow(1.0 - p0, static_cast<double>(numPairs - numCorrect)));
  result *= frac;
  if (verboseLevel > 2) {
    cout << "Score of col1, col2: " << result << " " 
	 << col1 << " " << col2 << " " << endl;
  }
  POSTCOND((result >= 0.0) && (result <= 1.0));
  return result;
}

/** Pairwise information term of two columns.

    For every ordered pair of alphabet letters (a, b) with positive pair
    frequency f_ab and positive single frequencies f_a (in col1) and
    f_b (in col2), the term f_ab * lg2(f_ab / (f_a * f_b)) is
    subtracted from the result. Frequencies come from the unweighted
    frequency() functions, so gap rows count towards the denominators.

    @return minus the sum of f_ab * lg2(f_ab / (f_a * f_b))
*/
double
CompensationScorer::pairwiseEntropy(const string& col1,
				    const string& col2) const
{
  double total = 0.0;
  const unsigned int numLetters = alphabet.size();
  for (unsigned int a = 0; a < numLetters; ++a) {
    const double freqA = frequency(col1, alphabet[a]);
    for (unsigned int b = 0; b < numLetters; ++b) {
      const double freqAB = frequency(col1, col2, alphabet[a], alphabet[b]);
      const double freqB = frequency(col2, alphabet[b]);
      if (!(freqAB > 0.0) || !(freqA > 0.0) || !(freqB > 0.0)) {
	continue; // term undefined or zero
      }
      total -= freqAB * lg2(freqAB/(freqA*freqB));
    }
  }
  return total;
}


/** Weighted pairwise information term of two columns.

    For every ordered pair of alphabet letters (a, b) with positive
    weighted pair frequency f_ab and positive weighted single frequencies
    f_a (in col1) and f_b (in col2), the term
    f_ab * lg2(f_ab / (f_a * f_b)) is subtracted from the result.

    All three frequencies use the same weight vector. Previously f_a was
    computed without weights while f_ab and f_b were weighted, so the
    marginals did not match the joint frequency.

    @param col1 first column
    @param col2 second column, same length as col1
    @param wVec one weight per row, same length as the columns
    @return minus the sum of f_ab * lg2(f_ab / (f_a * f_b))
*/
double
CompensationScorer::pairwiseEntropy(const string& col1,
				    const string& col2,
				    const Vec<double>& wVec) const
{
  PRECOND((col1.size() == col2.size()) && (col1.size() == wVec.size()));
  double result = 0.0;
  for (unsigned int i = 0; i < alphabet.size(); ++i) {
    // weighted, to stay consistent with fij and fj below
    double fi = frequency(col1, alphabet[i], wVec);
    for (unsigned int j = 0; j < alphabet.size(); ++j) {
      double fij = frequency(col1, col2, alphabet[i], alphabet[j], wVec);
      double fj = frequency(col2, alphabet[j], wVec);
      if ((fij > 0.0) && (fi > 0.0) && (fj > 0.0)) {
	result -= fij * lg2(fij/(fi*fj));
      }
    }
  }
  return result;
}


/** returns entropy of combined columns 1 and 2,
    but only take the part that is matching */
double
CompensationScorer::pairwiseEntropy2(const string& col1,
				    const string& col2) const
{
  PRECOND(col1.size() == col2.size());
  double result = 0.0;
  // const double LOG2 = log(2.0);
  unsigned int n = col1.size();
  Vec<char> c1Vec(col1.size());
  Vec<char> c2Vec(col2.size());
  unsigned int counter = 0;
  for (unsigned int i = 0; i < n; ++i) {
    if (isAllowedPair(col1[i], col2[i])) {
      c1Vec[counter] = col1[i];
      c2Vec[counter] = col2[i];
      ++counter;
    }
  }
  string s1(counter,'X');
  string s2(counter,'X');
  for (unsigned int i = 0; i < counter; ++i) {
    s1[i] = c1Vec[i];
    s2[i] = c2Vec[i];
  }
  // POSTCOND(result >= 0.0);
  result = pairwiseEntropy(s1, s2);
  return result;
}

/** returns entropy of combined columns 1 and 2,
    but only take the part that is matching,
    use weight vector
*/
double
CompensationScorer::pairwiseEntropy2(const string& col1,
				     const string& col2,
				     const Vec<double>& w) const
{
  PRECOND(col1.size() == col2.size());
  double result = 0.0;
  // const double LOG2 = log(2.0);
  unsigned int n = col1.size();
  Vec<char> c1Vec(col1.size());
  Vec<char> c2Vec(col2.size());
  unsigned int counter = 0;
  Vec<double> wVec(w.size(), 0.0);
  for (unsigned int i = 0; i < n; ++i) {
    if (isAllowedPair(col1[i], col2[i])) {
      c1Vec[counter] = col1[i];
      c2Vec[counter] = col2[i];
      wVec[counter] = w[i];
      ++counter;
    }
  }
  string s1(counter,'X');
  string s2(counter,'X');
  Vec<double> wVecFinal(counter);
  for (unsigned int i = 0; i < counter; ++i) {
    s1[i] = c1Vec[i];
    s2[i] = c2Vec[i];
    wVecFinal[i] = wVec[i];
  }
  probabilityNormalize(wVecFinal); // normalize such that sum is one
  // POSTCOND(result >= 0.0);
  result = pairwiseEntropy(s1, s2, wVecFinal);
  return result;
}

/** Information content of a single column with a small-sample
    correction (attributed to Tom Schneider, 1986, by the original
    author).

    The uncertainty -sum f * lg2(f) is computed from the weighted,
    gap-excluding frequencies (frequency2), increased by
    (alphabet.size() - 1) / (2 * ln(2) * numAligned) and subtracted from
    lg2(alphabet.size()).

    @param col1 column to evaluate
    @param wVec one weight per row, same length as col1
    @return information value; 0.0 if fewer than MIN_ALIGNED rows are
            non-gap
*/
double
CompensationScorer::singleEntropy3(const string& col1,
				   const Vec<double>& wVec) const
{
  PRECOND((col1.size() == wVec.size()));

  // number of rows that are not gaps
  unsigned int numAligned = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if (!(col1[row] == GAP_CHAR)) {
      ++numAligned;
    }
  }
  if (numAligned < MIN_ALIGNED) {
    return 0.0;
  }

  double uncertainty = 0.0;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    const double f = frequency2(col1, alphabet[a], wVec);
    if (!(f > 0.0)) {
      continue;
    }
    uncertainty -= f * lg2(f);
  }

  // small sample size increases the uncertainty
  const double twolg2 = 2.0 * log(2.0);
  uncertainty += (alphabet.size()-1.0)/(twolg2*numAligned);

  // information = maximum uncertainty - observed uncertainty
  return lg2(static_cast<double>(alphabet.size())) - uncertainty;
}

/** Information content of a single column using pseudocounts and a
    small-sample correction (attributed to Tom Schneider, 1986, by the
    original author).

    Frequencies come from the weighted frequency3() with pseudocount 1 in
    the numerator and alphabet.size() in the denominator. The uncertainty
    -sum f * lg2(f) is subtracted from lg2(alphabet.size()); the result
    is then reduced by (alphabet.size() - 1) / (2 * ln(2) * numAligned).

    @param col1 column to evaluate
    @param wVec one weight per row, same length as col1
    @return information value; 0.0 if fewer than MIN_ALIGNED rows are
            non-gap
*/
double
CompensationScorer::singleEntropy4(const string& col1,
				   const Vec<double>& wVec) const
{
  PRECOND((col1.size() == wVec.size()));
  // number of rows that are not gaps
  unsigned int numAligned = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if (!(col1[row] == GAP_CHAR)) {
      ++numAligned;
    }
  }
  if (numAligned < MIN_ALIGNED) {
    return 0.0;
  }

  double uncertainty = 0.0;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    const double f = frequency3(col1, alphabet[a], 1, alphabet.size(), wVec);
    if (!(f > 0.0)) {
      continue;
    }
    uncertainty -= f * lg2(f);
  }

  // information = maximum uncertainty - observed uncertainty
  double information = lg2(static_cast<double>(alphabet.size())) - uncertainty;
  // small sample size decreases the information
  const double twolg2 = 2.0 * log(2.0);
  information -= (alphabet.size()-1.0)/(twolg2*numAligned);
  return information;
}


/** Information content of a single column using pseudocounts, without
    small-sample correction.

    Frequencies come from the weighted frequency3() with pseudocount 1 in
    the numerator and alphabet.size() in the denominator. The uncertainty
    -sum f * lg2(f) is subtracted from lg2(alphabet.size()).

    @param col1 column to evaluate
    @param wVec one weight per row, same length as col1
    @return information value; 0.0 if fewer than MIN_ALIGNED rows are
            non-gap
*/
double
CompensationScorer::singleEntropy5(const string& col1,
				   const Vec<double>& wVec) const
{
  PRECOND((col1.size() == wVec.size()));
  // number of rows that are not gaps
  unsigned int numAligned = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if (!(col1[row] == GAP_CHAR)) {
      ++numAligned;
    }
  }
  if (numAligned < MIN_ALIGNED) {
    return 0.0;
  }

  double uncertainty = 0.0;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    const double f = frequency3(col1, alphabet[a], 1, alphabet.size(), wVec);
    if (!(f > 0.0)) {
      continue;
    }
    uncertainty -= f * lg2(f);
  }

  // information = maximum uncertainty - observed uncertainty
  return lg2(static_cast<double>(alphabet.size())) - uncertainty;
}


/** Information content of a single column, without pseudocounts and
    without small-sample correction.

    The uncertainty -sum f * lg2(f) is computed from the weighted,
    gap-excluding frequencies (frequency2) and subtracted from
    lg2(alphabet.size()).

    @param col1 column to evaluate
    @param wVec one weight per row, same length as col1
    @return information value; 0.0 if fewer than MIN_ALIGNED rows are
            non-gap or if no alphabet letter has a positive frequency
*/
double
CompensationScorer::singleEntropy6(const string& col1,
				   const Vec<double>& wVec) const
{
  PRECOND((col1.size() == wVec.size()));
  PRECOND(alphabet.size() > 0);
  // number of rows that are not gaps
  unsigned int numAligned = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if (!(col1[row] == GAP_CHAR)) {
      ++numAligned;
    }
  }
  if (numAligned < MIN_ALIGNED) {
    return 0.0;
  }

  double uncertainty = 0.0;
  bool anyLetterSeen = false;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    const double f = frequency2(col1, alphabet[a], wVec);
    if (!(f > 0.0)) {
      continue;
    }
    uncertainty -= f * lg2(f);
    anyLetterSeen = true;
  }
  if (!anyLetterSeen) {
    return 0.0; // column holds no letter of the alphabet
  }

  // information = maximum uncertainty - observed uncertainty
  return lg2(static_cast<double>(alphabet.size())) - uncertainty;
}

/** Information content of a single column, without pseudocounts and
    without small-sample correction.

    Same computation as singleEntropy6, except that a column without any
    alphabet letter of positive frequency only triggers a warning on cout
    and the computation continues (yielding lg2(alphabet.size())).

    @param col1 column to evaluate
    @param wVec one weight per row, same length as col1
    @return information value; 0.0 if fewer than MIN_ALIGNED rows are
            non-gap
*/
double
CompensationScorer::singleEntropy7(const string& col1,
				   const Vec<double>& wVec) const
{
  PRECOND((col1.size() == wVec.size()));
  PRECOND(alphabet.size() > 0);
  // number of rows that are not gaps
  unsigned int numAligned = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if (!(col1[row] == GAP_CHAR)) {
      ++numAligned;
    }
  }
  if (numAligned < MIN_ALIGNED) {
    return 0.0;
  }

  double uncertainty = 0.0;
  bool anyLetterSeen = false;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    const double f = frequency2(col1, alphabet[a], wVec);
    if (!(f > 0.0)) {
      continue;
    }
    uncertainty -= f * lg2(f);
    anyLetterSeen = true;
  }
  if (!anyLetterSeen) {
    // warn only; the result is still computed below
    cout << "Strange line (only gaps or unknown charaters: " << col1 << endl;
  }

  // information = maximum uncertainty - observed uncertainty
  return lg2(static_cast<double>(alphabet.size())) - uncertainty;
}


/** Raw uncertainty of a single column.

    Returns -sum f * lg2(f) over the alphabet, using the weighted,
    gap-excluding frequencies (frequency2). Unlike the other
    singleEntropy* functions the value is NOT subtracted from
    lg2(alphabet.size()); no pseudocounts and no correction are applied
    here.

    @param col1 column to evaluate
    @param wVec one weight per row, same length as col1
    @return uncertainty; 0.0 if fewer than MIN_ALIGNED rows are non-gap
            or if no alphabet letter has a positive frequency
*/
double
CompensationScorer::singleEntropy8Raw(const string& col1,
				      const Vec<double>& wVec) const
{
  PRECOND((col1.size() == wVec.size()));
  PRECOND(alphabet.size() > 0);
  // number of rows that are not gaps
  unsigned int numAligned = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if (!(col1[row] == GAP_CHAR)) {
      ++numAligned;
    }
  }
  if (numAligned < MIN_ALIGNED) {
    return 0.0;
  }

  double uncertainty = 0.0;
  bool anyLetterSeen = false;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    const double f = frequency2(col1, alphabet[a], wVec);
    if (!(f > 0.0)) {
      continue;
    }
    uncertainty -= f * lg2(f);
    anyLetterSeen = true;
  }
  if (!anyLetterSeen) {
    return 0.0; // column holds no letter of the alphabet
  }
  return uncertainty;
}

/** returns entropy of combined columns 1 and 2.
 Only use P log P  like Tom Schneider,
 correction due to random sampling,
 do NOT use Bayesian pseudocounts, 
 NOT only matching nucleotides */
double
CompensationScorer::singleEntropy8(const string& col1,
				   const Vec<double>& wVec) const
{
  PRECOND((col1.size() == wVec.size()));
  PRECOND(alphabet.size() > 0);
  if (randomSampleNum < 1) {
    return singleEntropy7(col1, wVec); // return without correction
  }
  unsigned int numGap = 0;
  for (unsigned int i = 0; i < col1.size(); ++i) {
    if (col1[i] == GAP_CHAR) {
      ++numGap;
    }
  }
  if (numGap == col1.size()) {
    return 0.0;
  }
  unsigned int effSize = col1.size() - numGap;
  if (effSize >= 50) {
    return singleEntropy3(col1, wVec); // using simplified correction term
  }
  ASSERT(col1.size() == wVec.size());
  double result = singleEntropy8Raw(col1, wVec);
  double sum = 0.0;
  double squareSum = 0.0;
  double term;
  string randomSequence(effSize, 'X');
  // count number of gaps:
  Random& rnd = Random::getInstance();
  unsigned int n = 0;
  unsigned int minN = 10;
  for (unsigned int i = 0; i < randomSampleNum; ++i) {
    generateRandomSequence(randomSequence, alphabet, rnd);
    ASSERT(randomSequence.size() == wVec.size());
    term = singleEntropy8Raw(randomSequence, wVec);
    sum += term;
    squareSum += term * term;
    ++n;
    if ((n > minN) 
	&& (sqrt(varianceFromSum(sum, squareSum, n)/n) < entropyStdMax)) {
      break;
    }
  }
  ASSERT(n > 0);
  sum /= n;
  // result = lg2(static_cast<double>(alphabet.size())) - result; // return 2 - uncertainty = information: decrease in information

  result = sum - result; // return expected uncertainty - uncertainty = information: decrease in information

  // cout << result << " final Result of " << col1 << " " << wVec << endl;
  // small sample correction according to Tom Schneider (1986):
  // result -= (alphabet.size()-1.0)/(twolg2*numAligned);
  //   result = result / (1.0 
  // 	    + 1.0/lg2(static_cast<double>(col1.size()-numGaps)));
  // POSTCOND(result >= 0.0);
  return result;
}

/** exact way to compute expected uncertainty.
 * Adapted almost verbatim from Pascal implementation by Tom Schneider
 * at web site:
 * @see http://www.lecb.ncifcrf.gov/~toms/paper/schneider1986/latex/node28.html
*/
/*
double 
CompensationScorer::exactSingleExpectedUncertainty(int n, const Vec<double>& letterFrequencies)
{
  ERROR_IF(alphabet.size() != 4, 
	   "exactSingleExpectedUncertainty works only with 4-letter alphabets.",
	   exception);
  int na = 0;
  int nc = 0;
  int ng = 0;
  int nt = 0;
  bool done = false;
  do {
    if (nt > 0) {
      // ending on a t - do outer loops
      if (ng > 0) {
	ng = ng - 1;
	nt = nt + 1; // turn g into t
      }
      else if (nc > 0) {
	// turn one c into g and all t to g (note ng = 0 initially
	nc = nc - 1;
	ng = nt + 1;
	nt = 0;
      }
      else if (na > 0) {
	// turn one a into c and all g and t to c (note ng=nc=0 initially)
	na = na -1;
	nc = nt + 1;
	nt = 0;
      }
      else {
	done = true;
      }
    }
    else {
      // no t-increment innermost loop
      if (ng > 0) {
	ng = ng - 1;
	nt = nt + 1;
      }
      else if (nc > 0) {
	// turn c into g
	nc = nc - 1;
	ng = ng + 1;
      }
      else { // na > 0; turn a into c 
	na = na - 1;
	nc = nc + 1;
      }
    }
  }
  while (!done);
}
*/

/** Computes n! as a double by multiplying 1 * 2 * ... * n.
 * Returns 1.0 for n <= 0 (the loop body is never entered).
 * TODO: faster implementation with lookup tables
 */
double
CompensationScorer::computeFactorial(int n) const {
  double product = 1.0;
  int factor = 1;
  while (factor <= n) {
    product *= factor;
    ++factor;
  }
  return product;
}

/** Computes the natural logarithm of n! as log(1) + log(2) + ... + log(n).
 * Returns 0.0 for n <= 0 (the loop body is never entered).
 * TODO: faster implementation with lookup tables
 */
double
CompensationScorer::computeFactorialLog(int n) const {
  double logSum = 0.0;
  int factor = 1;
  while (factor <= n) {
    logSum += log(static_cast<double>(factor));
    ++factor;
  }
  return logSum;
}

/** exact way to compute excpected uncertainty.
 * Adapted almost verbatim from Pascal implementation by Tom Schneider
 * at web site:
 * @see http://www.lecb.ncifcrf.gov/~toms/paper/schneider1986/latex/node28.html
*/
double 
CompensationScorer::compositionProbability(int n, 
			   const Vec<int>& letterCounts,
			   const Vec<double>& letterFrequencies) const
{
  double resultLog = computeFactorialLog(n);
  for (unsigned int i = 0; i < letterFrequencies.size(); ++i) {
    resultLog = resultLog - computeFactorialLog(letterCounts[i]) + log(letterFrequencies[i]);
  }
  return exp(resultLog);
}

/** Uncertainty (in bits) of one letter composition: the sum of
 * -f * lg2(f) with f = letterCounts[i] / n, taken over all letters with
 * a positive count.
 * Corresponds to equation 12 at web site:
 * @see http://www.lecb.ncifcrf.gov/~toms/paper/schneider1986/latex/node28.html
 *
 * @param n total number of letters used as denominator of the frequencies
 * @param letterCounts count of each letter
 */
double 
CompensationScorer::compositionUncertainty(int n, 
                                           const Vec<int>& letterCounts) const
{
  const double total = static_cast<double>(n);
  double uncertainty = 0;
  for (unsigned int letter = 0; letter < letterCounts.size(); ++letter) {
    if (letterCounts[letter] <= 0) {
      continue; // letters that do not occur contribute nothing
    }
    const double freq = static_cast<double>(letterCounts[letter]) / total;
    uncertainty -= freq * lg2(freq);
  }
  return uncertainty;
}


/** Exact expected uncertainty of a column of n letters drawn from a
 * 4-letter alphabet: the sum over every composition (na, nc, ng, nt)
 * with na + nc + ng + nt == n of
 *   compositionProbability(composition) * compositionUncertainty(composition).
 * Adapted from the Pascal implementation by Tom Schneider at web site:
 * @see http://www.lecb.ncifcrf.gov/~toms/paper/schneider1986/latex/node28.html
 *
 * @param n number of letters in the column
 * @param letterBaseFrequencies probabilities of the 4 letters, in the same
 *        order as the counts (a, c, g, t)
 * @return expected uncertainty in bits; 0.0 if n is negative
 */
double 
CompensationScorer::exactSingleExpectedUncertainty(int n, 
                           const Vec<double>& letterBaseFrequencies) const
{
  ERROR_IF(alphabet.size() != 4, 
           "exactSingleExpectedUncertainty works only with 4-letter alphabets.");
  ERROR_IF(letterBaseFrequencies.size() != 4, 
           "exactSingleExpectedUncertainty needs exactly 4 letter frequencies.");
  double result = 0.0;
  Vec<int> letterCounts(4);
  // Only compositions that use all n letters are valid outcomes, so the
  // last count is fully determined by the other three.
  for (int na = 0; na <= n; ++na) {
    for (int nc = 0; na + nc <= n; ++nc) {
      for (int ng = 0; na + nc + ng <= n; ++ng) {
        letterCounts[0] = na;
        letterCounts[1] = nc;
        letterCounts[2] = ng;
        letterCounts[3] = n - na - nc - ng;
        result += compositionProbability(n, letterCounts, 
                                         letterBaseFrequencies)
          * compositionUncertainty(n, letterCounts);
      }
    }
  }
  return result;
}

/* begin module calehnb */
void 
CompensationScorer::calehnb(long n, long gna, long gnc, long gng, long gnt, 
			    double* hg, double* ehnb, double* varhnb) const
// long n, gna, gnc, gng, gnt;
// double *hg, *ehnb, *varhnb;
{
  const int maxsize = 200; // maximum size for logarithm
  long accuracy = 10000;
  /* ; debugging: boolean */
  /* calculate e(hnb) in bits/bp (ehnb) for a number (n) of example sequence
sites.  gna to gnt are the composition to use for the genome probabilities
of a to t.  the genomic uncertainty hg and the variance var(hnb)
(=varhnb) are also calculated. if the variable debugging is passed to the procedure then the individual
combinations of hnb are displayed.

        note: this procedure should not be broken into smaller
	procedures so that it remains efficient.
        version = 3.02; of procedure calehnb 1983 nov 23 */
  /* less than (1/accuracy) bits error is demanded
  for the sum of pnb (see variable 'total') at the end of the procedure */

  double log2 = log(2.0);   /* natural log of 2, used to find log base 2 */
  double logn;   /* log of n */
  double nlog2;   /* n * log2 */

  long gn;   /* sum of gna..gnt */
  double logpa, logpc, logpg, logpt;   /* logs of genome probabilities */

  /* log of n factorial is the sum of i=1 to n of log(i).
  the array below represents these logs up to n */
  double logfact[maxsize + 1];

  /* precalculated values of -p*log2(p), where p=nb/n for
     nb = 0 .. n.  m stands for minus */
  double mplog2p[maxsize + 1];

  long i;   /* index for logfact and mplog2p */
  double logi;   /* natural log of i */

  long na;
  long nc = 0, ng = 0, nt = 0;   /* numbers of bases in a site */
  bool done = false;   /* true when the loop is completed */

  double pnb;
  /* multinomial probability of a combination
                of na, nc, ng, nt */
  double hnb;   /* uncertainty for a combination of na..nt */
  double pnbhnb;   /* pnb*hnb, an intermediate result */
  double sshnb = 0.0;   /* sum of squares of hnb */

  /* variables for testing program correctness: */
  double total = 0.0;
  /* sum of pnb over all combinations of na..nt
     if this is not 1.00, the program is in error */
  long counter = 0;

  /* counts the number of times through
     the loop */

  /* prevent access to outside the arrays: */
  if (n > maxsize) {
    ERROR("n larger maximum size!");
  }

  logn = log((double)n);
  nlog2 = n * log2;

  /* get logs of genome probabilities */
  gn = gna + gnc + gng + gnt;
  logpa = log((double)gna / gn);
  logpc = log((double)gnc / gn);
  logpg = log((double)gng / gn);
  logpt = log((double)gnt / gn);

  /* find genomic uncertainty */
  *hg = -((gna * logpa + gnc * logpc + gng * logpg + gnt * logpt) / (gn * log2));

  *ehnb = 0.0;   /* start error uncertainty at zero */

  /* make table of log of n factorial up to n
     and entropies for nb/n */
  logfact[0] = 0.0;   /* factorial(0) = 0 */
  mplog2p[0] = 0.0;
  for (i = 1; i <= n; i++) {
    logi = log((double)i);
    logfact[i] = logfact[i-1] + logi;
    mplog2p[i] = i * (logn - logi) / nlog2;
  }

  /* begin by looking at the combination with all a: na = n */
  na = n;

  /* the following loop simulates a number of nested loops
  of the form:
     for b1=a to t do
        for b2=b1 to t do
           for b3=b2 to t do
              ...
                 for bn=b(n-1) to t do ...
  the resulting set of variables increase in alphabetic order
  since no inner loop variable can have a value less than any
  outer loop.  the number of times through the inner-most loop
  is given by:
     o = (n + 1)*(n + 2)*(n + 3)/6
  in the case where there are four symbols (a,c,g,t) and n is
  the number of nested loops.
     a recursive set of loops would be possible, but it
  would use up too much memory in practical cases (up to n=150
  or higher).  a second algorithm sequests the loop variables
  into an array and increments them there.  however, the goal
  is to get all possible combinations for na, nc, ng, nt, where
  the sum of these is n.  the nested loops provide all the
  combinations in alphabetic order, assuring that there can not
  be any duplicates.  to find nb (one of na..nt) one would look
  at which of the variables b1 to bn were of value b.  this is
  a wasteful operation.
     the loop below simulates the array of control variables
  by changing each nb directly.
  */

  do {
    /* pnb is calculated by taking the log of the expression

               fact(n)          na     nc     ng     nt
    pnb = ------------------- pa   * pc   * pg   * pt  .
          fact(na).. fact(nt)

    log(pnb) generates a series of sums, allowing
    the calculation to proceed by addition and
    multiplication rather than multiplication and
    exponentiation.  the factorials become tractable
    in this way */

    pnb = exp(logfact[n] - logfact[na] - logfact[nc] - logfact[ng] -
	      logfact[nt] + na * logpa + nc * logpc + ng * logpg + nt * logpt);
	/* n factorial */

    hnb = mplog2p[na] + mplog2p[nc] + mplog2p[ng] + mplog2p[nt];

    pnbhnb = pnb * hnb;

    *ehnb += pnbhnb;

    sshnb += pnbhnb * hnb;   /* sum of squares of hnb */

    /* the following section keeps track of the calculation
    and writes out the current set of nb. */
    counter++;
    /*         if debugging then begin
                write(output,' ',counter:2,' ');
                for i := 1 to na do write(output,'a');
                for i := 1 to nc do write(output,'c');
                for i := 1 to ng do write(output,'g');
                for i := 1 to nt do write(output,'t');
                write(output,' ',na:3,nc:3,ng:3,nt:3);
                writeln(output,' pnb = ',pnb:10:5);
             end;  */
    total += pnb;

    /* the remaining portion of this repeat loop generates
    the values of na, nc, ng and nt.  notice that
    there are 7 possibilities at each loop increment.
    other than the stop, in each case the sum of
    na+nc+ng+nt remains constant (=n). */
    if (nt > 0) {  /* ending on a t - do outer loops */
      if (ng > 0) {  /* turn g into t */
	ng--;
	nt++;
      } else if (nc > 0) {
	/* turn one c into g,
	   and all t to g (note ng = 0 initially) */
	nc--;
	ng = nt + 1;
	nt = 0;
      } else if (na > 0) {
	/* turn one a into c and
	   all g and t to c. (note ng=nc=0 initially) */
	na--;
	nc = nt + 1;
	nt = 0;
      } else
	done = true;   /* since nt = n */
    } else {
      if (ng > 0) {  /* turn g into t */
	ng--;
	nt++;
      } else if (nc > 0) {  /* turn c into g */
	nc--;
	ng++;
      } else {
	na--;
	nc++;
	/* na > 0; turn a into c */
      }
    }
  } while (!done);

  /* no t - increment innermost loop */
  /* final adjustment: we only have the sum of squares so far */
  *varhnb = sshnb - *ehnb * *ehnb;

  /* if this message appears, there is either a bug in the code or
     the computer cannot be as accurate as requested */
  if (accuracy != (long)floor(accuracy * total + 0.5)) {
    printf(" procedure calehnb: the sum of probabilities is\n");
    printf(" not accurate to one part in %ld\n", (long)accuracy);
    printf(" the sum of the probabilities is %10.8f\n", total);
  }

  /* if this message appear, then there is an error in the
     repeat-until loop: it did not repeat as many times as
     is expected from the algorithm */
  if (counter == (long)floor((n + 1.0) * (n + 2) * (n + 3) / 6 + 0.5))
    return;
  /*      writeln(output, '    total: ',total:10:5);
        writeln(output,'    count = ',counter:1);
        writeln(output,'    (n+1)*(n+2)*(n+3)/6 = ',
                            round((n+1)*(n+2)*(n+3)/6):1); */
  ERROR(" procedure calehnb: program error, the number of calculations is in error\n");
}  /* calehnb */


/** Information content of a single alignment column using the EXACT
 * expected uncertainty from calehnb as small-sample correction.
 * The result is ehnb (from calehnb, called with equal genome counts of 1
 * for all four bases) minus the raw uncertainty of the column
 * (singleEntropy8Raw). No Bayesian pseudocounts are used.
 *
 * Shortcuts:
 *  - column of gaps only : 0.0
 *  - 50 or more non-gap characters : delegate to singleEntropy3
 *    (simplified correction term)
 *
 * NOTE(review): the calehnb results are written to cout on every call;
 * looks like leftover debug output - confirm before relying on it.
 *
 * @param col1 alignment column (one character per sequence)
 * @param wVec per-sequence weights; must have the same size as col1
 */
double
CompensationScorer::singleEntropy9(const string& col1, const Vec<double>& wVec) const
{
  PRECOND((col1.size() == wVec.size()));
  PRECOND(alphabet.size() > 0);
  // effective column size: rows that are not a gap
  unsigned int effSize = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if (col1[row] != GAP_CHAR) {
      ++effSize;
    }
  }
  if (effSize == 0) {
    return 0.0;
  }
  if (effSize >= 50) {
    return singleEntropy3(col1, wVec); // using simplified correction term
  }
  ASSERT(col1.size() == wVec.size());
  const double observed = singleEntropy8Raw(col1, wVec);
  // call original calehnb with a uniform genome composition:
  double hg = 0;
  double ehnb = 0;
  double varhnb = 0;
  const long unitCount = 1;
  calehnb(static_cast<long>(effSize), unitCount, unitCount, unitCount,
          unitCount, &hg, &ehnb, &varhnb);
  cout << "Results of calehnb: " << hg << " " << ehnb << " " << varhnb << endl;
  // expected uncertainty - uncertainty = information
  return ehnb - observed;
}


/** returns entropy of combined columns 1 and 2.
 Only use P log P  like Tom Schneider,
 EXACT correction due to random sampling,
 do NOT use Bayesian pseudocounts, 
 NOT only matching nucleotides */
double
CompensationScorer::singleEntropy9(const string& col1) const
{
  return singleEntropy9(col1, Vec<double>(col1.size(), 1.0));
}

/** Pairwise information of columns 1 and 2 beyond their single-column
 * information (P log P like Tom Schneider, with small sample correction).
 * Steps:
 *  1. joint uncertainty: sum of -f * lg2(f) over all letter pairs, with
 *     f from frequency2(col1, col2, a, b, wVec)
 *  2. add the small sample correction (|alphabet|^2 - 1) / (2 ln2 * N),
 *     N = number of rows where neither column has a gap
 *  3. information = 2 * lg2(|alphabet|) - corrected uncertainty
 *  4. subtract singleEntropy3 of both columns
 *
 * @return 0.0 if fewer than MIN_ALIGNED rows are gap-free in both columns
 */
double
CompensationScorer::pairwiseEntropy3(const string& col1,
                                     const string& col2,
                                     const Vec<double>& wVec) const
{
  PRECOND((col1.size() == col2.size()) && (col1.size() == wVec.size()));
  // rows in which neither column holds a gap
  unsigned int alignedPairs = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if ((col1[row] != GAP_CHAR) && (col2[row] != GAP_CHAR)) {
      ++alignedPairs;
    }
  }
  // if only one sequence or less:
  if (alignedPairs < MIN_ALIGNED) {
    return 0.0;
  }
  double jointUncertainty = 0.0;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    for (unsigned int b = 0; b < alphabet.size(); ++b) {
      const double pairFreq = frequency2(col1, col2, alphabet[a], alphabet[b], wVec);
      if (pairFreq > 0.0) {
        jointUncertainty -= pairFreq * lg2(pairFreq);
      }
    }
  }
  // small sample correction according to Tom Schneider (1986):
  // INCREASE of uncertainty, DECREASE of information
  const double twolg2 = 2.0 * log(2.0);
  jointUncertainty += ((alphabet.size()*alphabet.size())-1.0)/(twolg2*alignedPairs);
  // maximum uncertainty - uncertainty = information
  double information = (2.0 * lg2(alphabet.size())) - jointUncertainty;
  // store only difference to single column entropy:
  const double single1 = singleEntropy3(col1, wVec);
  const double single2 = singleEntropy3(col2, wVec);
  information -= (single1 + single2);
  return information;
}

/** Same formula as the Schneider-style pairwise score with small sample
 * correction, but the pair frequencies are read from the freqLookUp
 * table instead of calling frequency2 for every letter pair.
 * The table is reset with clearLookUp(alphabet) and filled with
 * addCountsToLookUp(col1, col2, wVec); a pair frequency is
 * freqLookUp[a][b] / freqLookUpNorm.
 * The number of aligned rows is col1.size() - freqLookUpGaps
 * (presumably updated by addCountsToLookUp - verify there).
 *
 * @return 0.0 if fewer than MIN_ALIGNED rows are aligned or if
 *         freqLookUpNorm is not positive
 */
double
CompensationScorer::pairwiseEntropy3fast(const string& col1,
                                         const string& col2,
                                         const Vec<double>& wVec) const
{
  PRECOND((col1.size() == col2.size()) && (col1.size() == wVec.size()));
  clearLookUp(alphabet);
  addCountsToLookUp(col1, col2, wVec);
  unsigned int alignedPairs = col1.size()-freqLookUpGaps;
  // if only one sequence or less:
  if ((alignedPairs < MIN_ALIGNED) || (freqLookUpNorm <= 0.0)) {
    return 0.0;
  }
  double jointUncertainty = 0.0;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    const int rowIndex = static_cast<int>(alphabet[a]);
    for (unsigned int b = 0; b < alphabet.size(); ++b) {
      const int colIndex = static_cast<int>(alphabet[b]);
      const double pairFreq = freqLookUp[rowIndex][colIndex]/freqLookUpNorm;
      if (pairFreq > 0.0) {
        jointUncertainty -= pairFreq * lg2(pairFreq);
      }
    }
  }
  // small sample correction according to Tom Schneider (1986):
  // INCREASE of uncertainty, decrease of information:
  const double twolg2 = 2.0 * log(2.0);
  jointUncertainty += ((alphabet.size()*alphabet.size())-1.0)/(twolg2*alignedPairs);
  // maximum uncertainty - uncertainty = information
  double information = (2.0 * lg2(alphabet.size())) - jointUncertainty;
  // store only difference to single column entropy:
  const double single1 = singleEntropy3(col1, wVec);
  const double single2 = singleEntropy3(col2, wVec);
  information -= (single1 + single2);
  return information;
}

/** Pairwise information of columns 1 and 2 beyond their single-column
 * information, using frequency3 (called with 1 and |alphabet|^2 as its
 * pseudo-count arguments) for the pair frequencies, plus the small
 * sample correction of Tom Schneider (1986).
 * Steps:
 *  1. joint uncertainty: sum of -f * lg2(f) over all letter pairs
 *  2. information = 2 * lg2(|alphabet|) - uncertainty
 *  3. subtract the correction (|alphabet|^2 - 1) / (2 ln2 * N),
 *     N = number of rows where neither column has a gap
 *  4. subtract singleEntropy4 of both columns
 *
 * @return 0.0 if fewer than MIN_ALIGNED rows are gap-free in both columns
 */
double
CompensationScorer::pairwiseEntropy4(const string& col1,
                                     const string& col2,
                                     const Vec<double>& wVec) const
{
  PRECOND((col1.size() == col2.size()) && (col1.size() == wVec.size()));
  // rows in which neither column holds a gap
  unsigned int alignedPairs = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if ((col1[row] != GAP_CHAR) && (col2[row] != GAP_CHAR)) {
      ++alignedPairs;
    }
  }
  // if only one sequence or less:
  if (alignedPairs < MIN_ALIGNED) {
    return 0.0;
  }
  double jointUncertainty = 0.0;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    for (unsigned int b = 0; b < alphabet.size(); ++b) {
      const double pairFreq = frequency3(col1, col2, alphabet[a], alphabet[b], 1, alphabet.size()*alphabet.size(),
                                         wVec);
      if (pairFreq > 0.0) {
        jointUncertainty -= pairFreq * lg2(pairFreq);
      }
    }
  }
  // maximum uncertainty - uncertainty = information
  double information = (2.0 * lg2(alphabet.size())) - jointUncertainty;
  // small sample correction according to Tom Schneider (1986):
  // DECREASE of information
  const double twolg2 = 2.0 * log(2.0);
  information -= ((alphabet.size()*alphabet.size())-1.0)/(twolg2*alignedPairs);
  // store only difference to single column entropy:
  const double single1 = singleEntropy4(col1, wVec);
  const double single2 = singleEntropy4(col2, wVec);
  information -= (single1 + single2);
  return information;
}

/** Pairwise information of columns 1 and 2 beyond their single-column
 * information, using frequency3 (called with 1 and |alphabet|^2 as its
 * pseudo-count arguments) for the pair frequencies.
 * NO small sample correction term is applied.
 * Steps:
 *  1. joint uncertainty: sum of -f * lg2(f) over all letter pairs
 *  2. information = 2 * lg2(|alphabet|) - uncertainty
 *  3. subtract singleEntropy5 of both columns
 *
 * @return 0.0 if fewer than MIN_ALIGNED rows are gap-free in both columns
 */
double
CompensationScorer::pairwiseEntropy5(const string& col1,
                                     const string& col2,
                                     const Vec<double>& wVec) const
{
  PRECOND((col1.size() == col2.size()) && (col1.size() == wVec.size()));
  // rows in which neither column holds a gap
  unsigned int alignedPairs = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if ((col1[row] != GAP_CHAR) && (col2[row] != GAP_CHAR)) {
      ++alignedPairs;
    }
  }
  // if only one sequence or less:
  if (alignedPairs < MIN_ALIGNED) {
    return 0.0;
  }
  double jointUncertainty = 0.0;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    for (unsigned int b = 0; b < alphabet.size(); ++b) {
      const double pairFreq = frequency3(col1, col2, alphabet[a], alphabet[b], 1, alphabet.size()*alphabet.size(),
                                         wVec);
      if (pairFreq > 0.0) {
        jointUncertainty -= pairFreq * lg2(pairFreq);
      }
    }
  }
  // maximum uncertainty - uncertainty = information
  double information = (2.0 * lg2(alphabet.size())) - jointUncertainty;
  // store only difference to single column entropy:
  const double single1 = singleEntropy5(col1, wVec);
  const double single2 = singleEntropy5(col2, wVec);
  information -= (single1 + single2);
  return information;
}


/** Pairwise information of columns 1 and 2 beyond their single-column
 * information, using frequency2 for the pair frequencies (no
 * pseudo-counts). NO small sample correction term is applied.
 * Steps:
 *  1. joint uncertainty: sum of -f * lg2(f) over all letter pairs
 *  2. information = 2 * lg2(|alphabet|) - uncertainty
 *  3. subtract singleEntropy6 of both columns
 *
 * @return 0.0 if fewer than MIN_ALIGNED rows are gap-free in both columns
 */
double
CompensationScorer::pairwiseEntropy6(const string& col1,
                                     const string& col2,
                                     const Vec<double>& wVec) const
{
  PRECOND((col1.size() == col2.size()) && (col1.size() == wVec.size()));
  // rows in which neither column holds a gap
  unsigned int alignedPairs = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if ((col1[row] != GAP_CHAR) && (col2[row] != GAP_CHAR)) {
      ++alignedPairs;
    }
  }
  // if only one sequence or less:
  if (alignedPairs < MIN_ALIGNED) {
    return 0.0;
  }
  double jointUncertainty = 0.0;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    for (unsigned int b = 0; b < alphabet.size(); ++b) {
      const double pairFreq = frequency2(col1, col2, alphabet[a], alphabet[b], wVec);
      if (pairFreq > 0.0) {
        jointUncertainty -= pairFreq * lg2(pairFreq);
      }
    }
  }
  // maximum uncertainty - uncertainty = information
  double information = (2.0 * lg2(alphabet.size())) - jointUncertainty;
  // store only difference to single column entropy:
  const double single1 = singleEntropy6(col1, wVec);
  const double single2 = singleEntropy6(col2, wVec);
  information -= (single1 + single2);
  return information;
}

/** Pairwise information of columns 1 and 2 beyond their single-column
 * information, using frequency2 for the pair frequencies (no
 * pseudo-counts). NO small sample correction term is applied.
 * Steps:
 *  1. joint uncertainty: sum of -f * lg2(f) over all letter pairs
 *  2. information = 2 * lg2(|alphabet|) - uncertainty
 *  3. subtract singleEntropy7 of both columns
 *
 * @param col1 first alignment column
 * @param col2 second alignment column; same size as col1
 * @param wVec per-sequence weights; same size as col1
 * @return 0.0 if fewer than MIN_ALIGNED rows are gap-free in both columns
 */
double
CompensationScorer::pairwiseEntropy7(const string& col1,
                                     const string& col2,
                                     const Vec<double>& wVec) const
{
  PRECOND((col1.size() == col2.size()) && (col1.size() == wVec.size()));
  unsigned int numGaps = 0;
  for (unsigned int i = 0; i < col1.size(); ++i) {
    if ((col1[i] == GAP_CHAR) || (col2[i] == GAP_CHAR)) {
      ++numGaps;
    }
  }
  const unsigned int numAligned = col1.size()-numGaps;
  // if only one sequence or less:
  if (numAligned < MIN_ALIGNED) {
    return 0.0;
  }
  double result = 0.0;
  for (unsigned int i = 0; i < alphabet.size(); ++i) {
    for (unsigned int j = 0; j < alphabet.size(); ++j) {
      const double fij = frequency2(col1, col2, alphabet[i], alphabet[j], wVec);
      if (fij > 0.0) {
        result -= fij * lg2(fij);
      }
    }
  }
  // maximum uncertainty - uncertainty = information
  result = (2.0 * lg2(alphabet.size())) - result;
  // store only difference to single column entropy; both columns must be
  // scored with the same single-column variant (singleEntropy7)
  result -= (singleEntropy7(col1, wVec)+singleEntropy7(col2, wVec));
  return result;
}

/** Variant of pairwiseEntropy7 that takes the single-column values as
 * arguments instead of computing them.
 * Steps:
 *  1. joint uncertainty: sum of -f * lg2(f) over all letter pairs, with
 *     f from frequency2(col1, col2, a, b, wVec)
 *  2. information = 2 * lg2(|alphabet|) - uncertainty
 *  3. subtract singleVal1 + singleVal2
 * NO small sample correction term is applied.
 *
 * @param singleVal1 single-column value subtracted for col1
 * @param singleVal2 single-column value subtracted for col2
 * @return 0.0 if fewer than MIN_ALIGNED rows are gap-free in both columns
 */
double
CompensationScorer::pairwiseEntropy7(const string& col1,
                                     const string& col2,
                                     double singleVal1,
                                     double singleVal2,
                                     const Vec<double>& wVec) const
{
  PRECOND((col1.size() == col2.size()) && (col1.size() == wVec.size()));
  // rows in which neither column holds a gap
  unsigned int alignedPairs = 0;
  for (unsigned int row = 0; row < col1.size(); ++row) {
    if ((col1[row] != GAP_CHAR) && (col2[row] != GAP_CHAR)) {
      ++alignedPairs;
    }
  }
  // if only one sequence or less:
  if (alignedPairs < MIN_ALIGNED) {
    return 0.0;
  }
  double jointUncertainty = 0.0;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    for (unsigned int b = 0; b < alphabet.size(); ++b) {
      const double pairFreq = frequency2(col1, col2, alphabet[a], alphabet[b], wVec);
      if (pairFreq > 0.0) {
        jointUncertainty -= pairFreq * lg2(pairFreq);
      }
    }
  }
  // maximum uncertainty - uncertainty = information
  double information = (2.0 * lg2(alphabet.size())) - jointUncertainty;
  // store only difference to single column entropy:
  information -= (singleVal1+singleVal2);
  return information;
}

/** Raw joint uncertainty - sum f_ij * lg2(f_ij) of two alignment columns.
    The pair counts are taken from the look-up table, which is cleared and
    refilled for this column pair (clearLookUp / addCountsToLookUp); each
    count is divided by freqLookUpNorm to obtain a frequency.
    No small sample correction and no subtraction of single-column terms
    happens here; callers apply those.
    Returns 0.0 if fewer than MIN_ALIGNED rows remain after subtracting
    freqLookUpGaps, or if freqLookUpNorm is not positive.
*/
double
CompensationScorer::pairwiseEntropy8RawFast(const string& col1,
                                            const string& col2,
                                            const Vec<double>& wVec) const
{
  PRECOND((col1.size() == col2.size()) && (col1.size() == wVec.size()));
  // rebuild the pair-count table for this column pair
  clearLookUp(alphabet);
  addCountsToLookUp(col1, col2, wVec);
  const unsigned int alignedRows = col1.size() - freqLookUpGaps;
  const bool tooFewRows = (alignedRows < MIN_ALIGNED);
  if (tooFewRows || (freqLookUpNorm <= 0.0)) {
    return 0.0;
  }
  double uncertainty = 0.0;
  for (unsigned int a = 0; a < alphabet.size(); ++a) {
    // the table is indexed by character code
    const int firstCode = static_cast<int>(alphabet[a]);
    for (unsigned int b = 0; b < alphabet.size(); ++b) {
      const int secondCode = static_cast<int>(alphabet[b]);
      const double pairFreq = freqLookUp[firstCode][secondCode] / freqLookUpNorm;
      if (pairFreq > 0.0) {
        uncertainty -= pairFreq * lg2(pairFreq);
      }
    }
  }
  return uncertainty;
}

/** Estimates the expected raw uncertainty of two random columns.
    Repeatedly generates two random sequences of length effSize from the
    alphabet alph and averages pairwiseEntropy8RawFast over them (all
    weights 1.0). At most randomSampleSquareNum samples are drawn; sampling
    stops early once more than minN samples exist and the standard error
    of the mean drops below error.
    Returns the sample mean.
*/
double
CompensationScorer::expectedUncertainty2Columns(const string& alph, unsigned int effSize, double error,
                                                unsigned int randomSampleSquareNum,
                                                unsigned int minN) const
{
  Random& rnd = Random::getInstance();
  string ranSeq1(effSize, 'X');
  string ranSeq2(effSize, 'X');
  Vec<double> wVec(effSize, 1.0);
  double total = 0;
  double totalOfSquares = 0.0;
  unsigned int n = 0; // number of samples drawn so far
  while (n < randomSampleSquareNum) {
    generateRandomSequence(ranSeq1, alph, rnd);
    generateRandomSequence(ranSeq2, alph, rnd);
    ASSERT(ranSeq1.size() == ranSeq2.size());
    ASSERT(ranSeq1.size() == wVec.size());
    const double sample = pairwiseEntropy8RawFast(ranSeq1, ranSeq2, wVec);
    total += sample;
    totalOfSquares += sample * sample;
    ++n;
    if (n > minN) {
      // standard error of the mean so far
      const double stdError = sqrt(varianceFromSum(total, totalOfSquares, n)/n);
      if (stdError < error) {
        break;
      }
    }
  }
  ASSERT(n > 0);
  return total / n;
}

/** Estimates the standard deviation of the raw uncertainty of two random
    columns of length effSize drawn from the alphabet alph.
    Uses the same sampling scheme as the mean estimate: at most
    randomSampleSquareNum samples of pairwiseEntropy8RawFast, stopping early
    once more than minN samples exist and the standard error of the mean is
    below error. With verboseLevel > 1 progress is written to cout.
    Returns the standard deviation of a single sample (not of the mean).
*/
double
CompensationScorer::expectedUncertainty2ColumnsStddev(const string& alph, unsigned int effSize, double error,
                                                      unsigned int randomSampleSquareNum,
                                                      unsigned int minN) const
{
  Random& rnd = Random::getInstance();
  string ranSeq1(effSize, 'X');
  string ranSeq2(effSize, 'X');
  const bool verbose = (verboseLevel > 1);
  if (verbose) {
    cout << "Starting expectedUncertainty2ColumnsStddev! " << alph << " " << effSize << " " << error << endl;
  }
  Vec<double> wVec(effSize, 1.0);
  double total = 0;
  double totalOfSquares = 0.0;
  unsigned int n = 0; // number of samples drawn so far
  while (n < randomSampleSquareNum) {
    generateRandomSequence(ranSeq1, alph, rnd);
    generateRandomSequence(ranSeq2, alph, rnd);
    ASSERT(ranSeq1.size() == ranSeq2.size());
    ASSERT(ranSeq1.size() == wVec.size());
    const double sample = pairwiseEntropy8RawFast(ranSeq1, ranSeq2, wVec);
    total += sample;
    totalOfSquares += sample * sample;
    ++n;
    if (n <= 2) {
      continue; // too few samples for a meaningful deviation
    }
    const double sampleDev = sqrt(varianceFromSum(total, totalOfSquares, n));
    const double meanDev = sampleDev / sqrt(static_cast<double>(n));
    if (verbose) {
      cout << n << " " << sample << " " << total << " " << totalOfSquares << " +- "
           << sampleDev << " " << meanDev << endl;
    }
    if ((n > minN) && (meanDev < error)) {
      break;
    }
  }
  ASSERT(n > 0);
  const double stddev = sqrt(varianceFromSum(total, totalOfSquares, n));
  if (verbose) {
    cout << "Ending expectedUncertainty2ColumnsStddev!" << endl;
  }
  return stddev;
}

/** Returns the two alignment columns restricted to the rows in which
    neither column contains gapChar.
    Both inputs must have the same length. The two returned strings have
    equal length and keep the original row order; both are empty if no row
    is gap-free in both columns.
*/
pair<string, string>
CompensationScorer::filterGaps(const string& col1Orig, const string& col2Orig, char gapChar)
{
  PRECOND(col1Orig.size() == col2Orig.size());
  pair<string, string> filtered;
  // at most every row survives, so one allocation per column suffices
  filtered.first.reserve(col1Orig.size());
  filtered.second.reserve(col2Orig.size());
  for (string::size_type i = 0; i < col1Orig.size(); ++i) {
    if ((col1Orig[i] != gapChar) && (col2Orig[i] != gapChar)) {
      filtered.first += col1Orig[i];
      filtered.second += col2Orig[i];
    }
  }
  return filtered;
}

/** returns entropy of combined columns 1 and 2.
 Only use P log P  like Tom Schneider, small sample correction */
double
CompensationScorer::pairwiseEntropy8Fast(const string& col1Orig,
					 const string& col2Orig,
					 const Vec<double>& wVecOrig) const
{
  PRECOND((col1Orig.size() == col2Orig.size()) && (col1Orig.size() == wVecOrig.size()));
  pair<string, string> filteredSeq = filterGaps(col1Orig, col2Orig, GAP_CHAR);
  const string& col1 = filteredSeq.first;
  const string& col2 = filteredSeq.second;
  if (col1.size() == 0){
    return 0.0;
  }
  unsigned int effSize = col1.size();
  Vec<double> wVec(effSize, 1.0);
  ASSERT(col1.size() == col2.size());
  ASSERT(col1.size() == wVec.size());
  double result = pairwiseEntropy8RawFast(col1, col2, wVec); // sum p log p
  double sum = 0.0;
  double squareSum = 0.0;
  double term;
  Random& rnd = Random::getInstance();
  string ranSeq1(effSize, 'X');
  string ranSeq2(effSize, 'X');
  unsigned int n = 0;
  unsigned int minN = 10;
  for (unsigned int i = 0; i < randomSampleSquareNum; ++i) {
    generateRandomSequence(ranSeq1, alphabet, rnd);
    generateRandomSequence(ranSeq2, alphabet, rnd);
    ASSERT(ranSeq1.size() == ranSeq2.size());
    ASSERT(ranSeq1.size() == wVec.size());
    term = pairwiseEntropy8RawFast(ranSeq1, ranSeq2, wVec);
    sum += term;
    squareSum += term * term;
    ++n;
    if ((n > minN)
	&& (sqrt(varianceFromSum(sum, squareSum, n)/n) < entropyStdMax)) {
	break;
    }
  }
  ASSERT(n > 0);
  sum /= n;
  result = sum - result;
  result -= (singleEntropy8(col1, wVec)+singleEntropy8(col2, wVec));

  // POSTCOND(result >= 0.0);
  return result;
}

/** returns entropy of combined columns 1 and 2.
 Only use P log P  like Tom Schneider, small sample correction */
double
CompensationScorer::pairwiseEntropy9Fast(const string& col1Orig,
					 const string& col2Orig,
					 const Vec<double>& wVecOrig) const
{
  PRECOND((col1Orig.size() == col2Orig.size()) && (col1Orig.size() == wVecOrig.size()));
  pair<string, string> filteredSeq = filterGaps(col1Orig, col2Orig, GAP_CHAR);
  const string& col1 = filteredSeq.first;
  const string& col2 = filteredSeq.second;
  if (col1.size() == 0){
    return 0.0;
  }
  unsigned int effSize = col1.size();
  const double twolg2 = 2.0 * log(2.0);
  Vec<double> wVec(effSize, 1.0);
  ASSERT(col1.size() == col2.size());
  ASSERT(col1.size() == wVec.size());
  double expectedUncertainty = 0.0;
  double result = pairwiseEntropy8RawFast(col1, col2, wVec); // sum p log p
  if (effSize < 100) {
    double sum = 0.0;
    double squareSum = 0.0;
    double term;
    Random& rnd = Random::getInstance();
    string ranSeq1(effSize, 'X');
    string ranSeq2(effSize, 'X');
    unsigned int n = 0;
    unsigned int minN = 10;
    for (unsigned int i = 0; i < randomSampleSquareNum; ++i) {
      generateRandomSequence(ranSeq1, alphabet, rnd);
      generateRandomSequence(ranSeq2, alphabet, rnd);
      ASSERT(ranSeq1.size() == ranSeq2.size());
      ASSERT(ranSeq1.size() == wVec.size());
      term = pairwiseEntropy8RawFast(ranSeq1, ranSeq2, wVec);
      sum += term;
      squareSum += term * term;
      ++n;
      if ((n > minN) && (sqrt(varianceFromSum(sum, squareSum, n)/n) < entropyStdMax)) {
	break;
      }
    }
    expectedUncertainty = sum / n;
  }
  else { // correction term
    double correction = ((alphabet.size()*alphabet.size()) - 1) / (twolg2*effSize);
    // expcected uncertainty for lg2(16) = 4 :
    expectedUncertainty = lg2(static_cast<double>(alphabet.size()*alphabet.size())) - correction;
  }
  result = expectedUncertainty - result;
  result -= (singleEntropy9(col1, wVec)+singleEntropy9(col2, wVec));

  // POSTCOND(result >= 0.0);
  return result;
}


/** returns entropy of combined columns 1 and 2.
 Only use P log P  like Tom Schneider, small sample correction */
double
CompensationScorer::pairwiseEntropy9Fast(const string& col1Orig,
					 const string& col2Orig,
					 double singleVal1,
					 double singleVal2,
					 const Vec<double>& wVecOrig) const
{
  PRECOND((col1Orig.size() == col2Orig.size()) && (col1Orig.size() == wVecOrig.size()));
  pair<string, string> filteredSeq = filterGaps(col1Orig, col2Orig, GAP_CHAR);
  const string& col1 = filteredSeq.first;
  const string& col2 = filteredSeq.second;
  if (col1.size() == 0){
    return 0.0;
  }
  unsigned int effSize = col1.size();
  const double twolg2 = 2.0 * log(2.0);
  Vec<double> wVec(effSize, 1.0);
  ASSERT(col1.size() == col2.size());
  ASSERT(col1.size() == wVec.size());
  double expectedUncertainty = 0.0;
  double result = pairwiseEntropy8RawFast(col1, col2, wVec); // sum p log p
  if (effSize < 100) {
    double sum = 0.0;
    double squareSum = 0.0;
    double term;
    Random& rnd = Random::getInstance();
    string ranSeq1(effSize, 'X');
    string ranSeq2(effSize, 'X');
    unsigned int n = 0;
    unsigned int minN = 10;
    for (unsigned int i = 0; i < randomSampleSquareNum; ++i) {
      generateRandomSequence(ranSeq1, alphabet, rnd);
      generateRandomSequence(ranSeq2, alphabet, rnd);
      ASSERT(ranSeq1.size() == ranSeq2.size());
      ASSERT(ranSeq1.size() == wVec.size());
      term = pairwiseEntropy8RawFast(ranSeq1, ranSeq2, wVec);
      sum += term;
      squareSum += term * term;
      ++n;
      if ((n > minN) && (sqrt(varianceFromSum(sum, squareSum, n)/n) < entropyStdMax)) {
	break;
      }
    }
    expectedUncertainty = sum / n;
  }
  else { // correction term
    double correction = ((alphabet.size()*alphabet.size()) - 1) / (twolg2*effSize);
    // expcected uncertainty for lg2(16) = 4 :
    expectedUncertainty = lg2(static_cast<double>(alphabet.size()*alphabet.size())) - correction;
  }
  result = expectedUncertainty - result;
  result -= (singleVal1 + singleVal2);

  // POSTCOND(result >= 0.0);
  return result;
}


/** returns entropy of combined columns 1 and 2.
 Only use P log P  like Tom Schneider, small sample correction */
double
CompensationScorer::pairwiseEntropy8Fast(const string& col1Orig,
					 const string& col2Orig,
					 double singleVal1,
					 double singleVal2,
					 const Vec<double>& wVecOrig) const
{
  PRECOND((col1Orig.size() == col2Orig.size()) && (col1Orig.size() == wVecOrig.size()));
  pair<string, string> filteredSeq = filterGaps(col1Orig, col2Orig, GAP_CHAR);
  const string& col1 = filteredSeq.first;
  const string& col2 = filteredSeq.second;
  if (col1.size() == 0){
    return 0.0;
  }
  unsigned int effSize = col1.size();
  Vec<double> wVec(effSize, 1.0);
  ASSERT(col1.size() == col2.size());
  ASSERT(col1.size() == wVec.size());
  double result = pairwiseEntropy8RawFast(col1, col2, wVec); // sum p log p
  double sum = 0.0;
  double squareSum = 0.0;
  double term;
  Random& rnd = Random::getInstance();
  string ranSeq1(effSize, 'X');
  string ranSeq2(effSize, 'X');
  unsigned int n = 0;
  unsigned int minN = 10;
  for (unsigned int i = 0; i < randomSampleSquareNum; ++i) {
    generateRandomSequence(ranSeq1, alphabet, rnd);
    generateRandomSequence(ranSeq2, alphabet, rnd);
    ASSERT(ranSeq1.size() == ranSeq2.size());
    ASSERT(ranSeq1.size() == wVec.size());
    term = pairwiseEntropy8RawFast(ranSeq1, ranSeq2, wVec);
    sum += term;
    squareSum += term * term;
    ++n;
    if ((n > minN)
	&& (sqrt(varianceFromSum(sum, squareSum, n)/n) < entropyStdMax)) {
	break;
    }
  }
  ASSERT(n > 0);
  sum /= n;
  result = sum - result;
  result -= (singleVal1+singleVal2);

  // POSTCOND(result >= 0.0);
  return result;
}


/** returns entropy of combined columns 1 and 2.
 Only use P log P  like Tom Schneider */
double
CompensationScorer::pairwiseEntropy3(const string& col1,
				     const string& col2) const
{
  PRECOND((col1.size() == col2.size()));
  Vec<double> wVec(col1.size(), 1.0);
  return pairwiseEntropy3(col1, col2, wVec);
}

/** returns entropy of combined columns 1 and 2.
 Only use P log P  like Tom Schneider,
 also use Bayesian pseudocounts. */
double
CompensationScorer::pairwiseEntropy4(const string& col1,
				     const string& col2) const
{
  PRECOND((col1.size() == col2.size()));
  Vec<double> wVec(col1.size(), 1.0);
  return pairwiseEntropy4(col1, col2, wVec);
}

/** returns entropy of combined columns 1 and 2.
 Only use P log P  like Tom Schneider,
 also use Bayesian pseudocounts. */
double
CompensationScorer::pairwiseEntropy5(const string& col1,
				     const string& col2) const
{
  PRECOND((col1.size() == col2.size()));
  Vec<double> wVec(col1.size(), 1.0);
  return pairwiseEntropy5(col1, col2, wVec);
}

/** returns entropy of combined columns 1 and 2.
 Only use P log P  like Tom Schneider, no sample correction
 NOT using Bayesian pseudocounts. */
double
CompensationScorer::pairwiseEntropy6(const string& col1,
				     const string& col2) const
{
  PRECOND((col1.size() == col2.size()));
  Vec<double> wVec(col1.size(), 1.0);
  return pairwiseEntropy6(col1, col2, wVec);
}


/** returns entropy of combined columns 1 and 2 */
// double
// CompensationScorer::pairwiseEntropyMax(const string& col1,
// 				       const string& col2) const
// {
//   PRECOND(alphabet.size() > 0);
//   PRECOND(col1.size() > 0);
//   double result = 0.0;
//   // const double LOG2 = log(2.0);
//   double fi = 1.0; //  / static_cast<double>(alphabet.size());
//   double fij = 1.0 / static_cast<double>(col1.size()); // static_cast<double>((alphabet.size() * alphabet.size()));
//   double fj = 1.0; //  / static_cast<double>(alphabet.size());
//   // double fi = frequency(col1, alphabet[i]);
//   // double fij = frequency(col1, col2, alphabet[i], alphabet[j]);
//   // double fj = frequency(col2, alphabet[j]);
//   result = - alphabet.size() * alphabet.size() * fij * lg2(fij/(fi*fj));
//   // result = - alphabet.size() * alphabet.size() * log
//   return result;
// }

/** returns entropy of combined columns 1 and 2 */
double
CompensationScorer::pairwiseEntropyMax(unsigned int n) const
{
  PRECOND(alphabet.size() > 0);
  string col1(n,'X');
  string col2(n,'X');
  unsigned int numIter = 10;
  double highest = -1e30; // pairwiseEntropy(col1, col2);
  Random& rnd = Random::getInstance();
  double x = 0.0;
  for (unsigned int i = 0; i < numIter; ++i) {
    for (unsigned int j = 0; j < n; ++j) {
      col1[j] = alphabet[rnd.getRand() % alphabet.size()];
      col2[j] = alphabet[rnd.getRand() % alphabet.size()];
    }
    x = pairwiseEntropy(col1, col2);
    // cout << "Random entropy with length " << n << " : " << x << endl;
    ERROR_IF(!isReasonable(x), "Internal error in line 411!");
    if (x > highest) {
      highest = x;
    }
  }
  return highest;
}

/** returns entropy of combined columns 1 and 2 */
double
CompensationScorer::pairwiseEntropyMin(unsigned int n) const
{
  PRECOND(alphabet.size() > 0);
  string col1(n,'X');
  unsigned int numIter = 10;
  double lowest = 1e30; // pairwiseEntropy(col1, col2);
  Random& rnd = Random::getInstance();
  double x = 0.0;
  for (unsigned int i = 0; i < numIter; ++i) {
    for (unsigned int j = 0; j < n; ++j) {
      col1[j] = alphabet[rnd.getRand() % alphabet.size()];
    }
    x = pairwiseEntropy(col1, col1);
    // cout << "Random entropy with length " << n << " : " << x << endl;
    ERROR_IF(!isReasonable(x), "Internal error in line 411!");
    if (x < lowest) {
      lowest = x;
    }
  }
  return lowest;
}

/** covariance score according to Hofacker 2002 */
double
CompensationScorer::covariance1(const string& col1, const string& col2) const
{
  double sum = 0.0;
  unsigned int nn = alphabet.size();
  double fij, fij2, dh;
  for (unsigned int i = 0; i < nn; ++i) {
    for (unsigned int j = 0; j < nn; ++j) {
      if (!isAllowedPair(alphabet[i], alphabet[j])) {
	continue;
      }
      fij = frequency2(col1, col2, alphabet[i], alphabet[j]);
      for (unsigned int i2 = 0; i2 < nn; ++i2) {
	for (unsigned int j2 = 0; j2 < nn; ++j2) {
	  if (!isAllowedPair(alphabet[i2], alphabet[j2])) {
	    continue;
	  }
	  fij2 = frequency2(col1, col2, alphabet[i2], alphabet[j2]);
	  // Hamming distance: how many character changer between pairs:
	  dh = 2 - deltaFunction(alphabet[i], alphabet[i2])
	    - deltaFunction(alphabet[j2], alphabet[j2]);
	  sum += fij * dh * fij2;
	}
      }
    }
  }
  return sum;
}

/** Ratio of pairwiseEntropy(col1, col2) to the sampled reference value
    pairwiseEntropyMax for columns of this length, capped at 1.0.
    No lower bound is enforced. */
double
CompensationScorer::relativeEntropy(const string& col1,
                                    const string& col2) const
{
  const double ratio = pairwiseEntropy(col1,col2) / pairwiseEntropyMax(col1.size());
  // the sampled maximum can be exceeded, so clip from above
  return (ratio > 1.0) ? 1.0 : ratio;
}

/** Computes mean and standard deviation of the substitution matrix
    entries subMatrix[id(a)][id(b)] over all row pairs (a, b) of a column.
    Pairs containing a letter without a valid matrix index are skipped, but
    the sums are still divided by the total number of row pairs
    n*(n-1)/2, so skipped pairs count as zero contributions.
    For columns with fewer than two rows there are no pairs; mean and dev
    are then set to 0.0.

    @param col  alignment column
    @param mean output: mean matrix entry over all row pairs
    @param dev  output: standard deviation of the matrix entries
    @return number of letters with a valid matrix index plus number of
            valid row pairs, but at least 1
*/
unsigned int
CompensationScorer::computeMeanAndDeviation(const string& col,
                                            double& mean,
                                            double& dev) const
{
  double sum = 0.0;
  double sqSum = 0.0;
  unsigned int counter = 0;
  Vec<unsigned int> idVec(col.size());
  for (unsigned int i = 0; i < col.size(); ++i) {
    idVec[i] = letterId(col[i]);
    if (idVec[i] < subMatrix.size()) {
      ++counter;
    }
  }
  // number of row pairs, computed in floating point so that very long
  // columns cannot overflow an unsigned int
  const double n = static_cast<double>(col.size());
  const double numPairs = 0.5 * n * (n - 1.0);
  double term;
  for (unsigned int i = 0; i < col.size(); ++i) {
    for (unsigned int j = i+1; j < col.size(); ++j) {
      if ((idVec[i] >= subMatrix.size())
          || (idVec[j] >= subMatrix[idVec[i]].size())) {
        continue; // skip letter
      }
      term = subMatrix[idVec[i]][idVec[j]];
      sum += term;
      sqSum += (term * term);
      ++counter;
    }
  }
  if (counter == 0) {
    counter = 1;
  }
  if (numPairs <= 0.0) {
    // fewer than two rows: no pairs, avoid 0/0
    mean = 0.0;
    dev = 0.0;
    return counter;
  }
  mean = sum / numPairs; // deliberately not divided by counter
  double variance = sqSum / numPairs - mean * mean;
  if (variance < 0.0) {
    variance = 0.0; // rounding can push a zero variance slightly below zero
  }
  dev = sqrt(variance);
  return counter;
}

/** Correlation between the substitution scores of two alignment columns.
    For every pair of rows (i, j) the substitution matrix entry of the two
    letters is looked up in column 1 and in column 2; both values are
    standardized with the mean and deviation from computeMeanAndDeviation
    and their products are summed. The sum is divided by the total number
    of row pairs n*(n-1)/2; row pairs with a letter that has no valid
    matrix index are skipped but still count in that divisor.

    Returns 0.0 if
     - the columns have fewer than two rows (no row pairs exist),
     - the count returned by computeMeanAndDeviation divided by the column
       length is below gapFracMin for either column,
     - the deviation of either column is not a positive number.
*/
double
CompensationScorer::pairwiseCorrelation(const string& col1,
                                        const string& col2) const
{
  PRECOND(subMatrix.size() == alphabet.size());
  PRECOND(col1.size() == col2.size());
  const unsigned int n = col1.size();
  if (n < 2) {
    return 0.0; // no row pairs: avoids division by zero below
  }
  double mean1 = 0.0;
  double mean2 = 0.0;
  double std1 = 0.0;
  double std2 = 0.0;
  const double count1 = computeMeanAndDeviation(col1, mean1, std1);
  const double count2 = computeMeanAndDeviation(col2, mean2, std2);
  if (((count1 / col1.size()) < gapFracMin)
      || ((count2 / col2.size()) < gapFracMin)) {
    return 0.0;
  }
  // written as !(x > 0) so that a NaN deviation is rejected as well
  if (!(std1 > 0.0) || !(std2 > 0.0)) {
    return 0.0;
  }
  Vec<unsigned int> idVec1(col1.size());
  Vec<unsigned int> idVec2(col2.size());
  for (unsigned int i = 0; i < col1.size(); ++i) {
    idVec1[i] = letterId(col1[i]);
  }
  for (unsigned int i = 0; i < col2.size(); ++i) {
    idVec2[i] = letterId(col2[i]);
  }
  // number of row pairs, in floating point to avoid unsigned overflow
  const double numPairs = 0.5 * static_cast<double>(n) * (static_cast<double>(n) - 1.0);
  double result = 0.0;
  double term1;
  double term2;
  for (unsigned int i = 0; i < col1.size(); ++i) {
    for (unsigned int j = i+1; j < col2.size(); ++j) {
      if ((idVec1[i] >= subMatrix.size())
          || (idVec1[j] >= subMatrix[idVec1[i]].size())) {
        continue; // skip letter
      }
      if ((idVec2[i] >= subMatrix.size())
          || (idVec2[j] >= subMatrix[idVec2[i]].size())) {
        continue; // skip letter
      }
      term1 = subMatrix[idVec1[i]][idVec1[j]];
      term2 = subMatrix[idVec2[i]][idVec2[j]];
      result += ((term1 - mean1)/std1)*((term2-mean2)/std2);
    }
  }
  result /= numPairs; // deliberately not the number of pairs actually used
  return result;
}

/** Entropy of a single column: - sum f_i * log(f_i) over all alphabet
    characters, using the natural logarithm. f_i is the value returned by
    frequency(col, character); characters with f_i <= 0 are ignored. */
double
CompensationScorer::entropy(const string& col) const {
  double total = 0.0;
  for (unsigned int letter = 0; letter < alphabet.size(); ++letter) {
    const double freq = frequency(col, alphabet[letter]);
    if (!(freq > 0.0)) {
      continue; // 0 * log(0) is taken as 0
    }
    total -= freq * log(freq);
  }
  return total;
}

/** Maximum entropy of a single column (natural logarithm): the value
    - sum f_i * log(f_i) for equal frequencies f_i = 1/|alphabet|, which
    equals log(|alphabet|). Requires a non-empty alphabet. */
double
CompensationScorer::entropyColumnMax() const {
  PRECOND(alphabet.size() > 0);
  // Convert the size to double before negating: unary minus applied to the
  // unsigned size would wrap around to a huge positive number.
  const double numLetters = static_cast<double>(alphabet.size());
  const double fi = 1.0 / numLetters;
  return -numLetters * (fi * log(fi));
}

/** Score without small sample correction: entropyWeight times
    pairwiseEntropy7 of the two columns with the given sequence weights. */
double
CompensationScorer::uncorrectedScore(const string& col1,
                                     const string& col2,
                                     const Vec<double>& aliSequenceWeights) const 
{
  const double information = pairwiseEntropy7(col1, col2, aliSequenceWeights);
  return entropyWeight * information;
}

/** returns 1 for highly reliable prediction,
    0 for totally uncompatible prediction */
double
CompensationScorer::errorScore(const string& col1,
			       const string& col2,
			       const Vec<double>& aliSequenceWeights) const {
  double correctedScore = compensationScore(col1, col2, aliSequenceWeights);
  double uncorrectedScore = entropyWeight * pairwiseEntropy7(col1, col2, aliSequenceWeights); 
  cout << "corrected: " << correctedScore << " uncorrected: " << uncorrectedScore << " " << col1 << " " << col2 << endl;
  return (uncorrectedScore - correctedScore);
}

/** Scores a pair of alignment columns with the scoring scheme selected by
    the member "algorithm" (0 to 30). Meaning and range of the returned
    value depend on the scheme:
      0      always 0.0 (no computation)
      1      compatibilityScore
      2-13   combinations of a compatibility score with pairwiseEntropy,
             pairwiseEntropy2, pairwiseCorrelation or relativeEntropy
      14-18, 25, 26
             energyWeight * compatibilityScore if energyWeight is non-zero,
             otherwise entropyWeight * one of the information measures
             (pairwiseEntropy3fast, 4, 5, 6, 7, 8Fast)
      19-22  functions of the single column values singleEntropy3
      23     averageRandomRenamedAllowedPairs
      24     covariance1
      27-29  single column values singleEntropy8
      30     entropyWeight * pairwiseEntropy9Fast
    Any other value of "algorithm" is reported through ERROR.

    NOTE(review): in the schemes 14-18, 25 and 26 the energy and the
    information term are mutually exclusive (if / else if); the information
    term is only used when energyWeight is exactly 0.0. Confirm that this is
    intended.
    NOTE: the overload taking precomputed single column values contains its
    own copy of this switch, including the expressions of case 21 and 22;
    keep both in sync.

    @param col1 first alignment column
    @param col2 second alignment column
    @param aliSequenceWeights sequence weights, used by the weighted schemes
*/
double
CompensationScorer::compensationScore(const string& col1,
                                      const string& col2,
                                      const Vec<double>& aliSequenceWeights) const {
  double result = 0.0;
  switch (algorithm) {
  case 0:
    result = 0.0; // no computation (useful for printing only reference structure)
    break;
  case 1:
    result = compatibilityScore(col1, col2);
    break;
  case 2:
    result = energyWeight * compatibilityScore(col1, col2) 
      - entropyWeight * pairwiseEntropy(col1, col2);
    break;
  case 3:
    result = energyWeight * compatibilityScore2(col1, col2) 
      - entropyWeight * pairwiseEntropy(col1, col2);
    break;
  case 4:
    result = energyWeight * compatibilityScore2(col1, col2) 
      + entropyWeight * pairwiseCorrelation(col1, col2);
    break;
  case 5:
    result = - compatibilityScore2(col1, col2) * pairwiseEntropy(col1, col2);
    break;
  case 6:
    // entropy only counts if the columns are compatible enough
    if (compatibilityScore2(col1, col2) >= compMin) {
      result =  - pairwiseEntropy(col1, col2);
    }
    else {
      result = 0.0;
    }
    break;
  case 7:
    if (compatibilityScore3(col1, col2) >= compMin) {
      result =  - entropyWeight * pairwiseEntropy(col1, col2);
    }
    else {
      result = 0.0;
    }
    break;
  case 8:
    result = - compatibilityScore3(col1, col2) * pairwiseEntropy(col1, col2);
    break;
  case 9:
    result = energyWeight * compatibilityScore3(col1, col2) 
      - entropyWeight * pairwiseEntropy(col1, col2);
    break;
  case 10:
    // soft threshold at compMin: logistic function with slope factor 25
    result = - logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
      *  pairwiseEntropy(col1, col2);
    break;
  case 11:
    // soft threshold at compMin: logistic function with slope factor 25
    result = logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
      *  (1.0 - relativeEntropy(col1, col2));
    break;
  case 12:
    // like case 10, but use pairwiseEntropy2
    result = - logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
      *  pairwiseEntropy2(col1, col2);
    break;
  case 13:
    // like case 10, but use pairwiseEntropy2 and using sequence weighting
    result = - logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
      *  pairwiseEntropy2(col1, col2, aliSequenceWeights);
    break;
  case 14:
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      result += entropyWeight * pairwiseEntropy3fast(col1, col2, aliSequenceWeights); 
    }
    break;
  case 15:
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      result += entropyWeight * pairwiseEntropy4(col1, col2, aliSequenceWeights); 
    }
    break;
  case 16:
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty (no sequence weights)
      result += entropyWeight * pairwiseEntropy5(col1, col2);
    }
    break;
  case 17:
    // use here information instead of uncertainty
    result = logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
      * pairwiseEntropy5(col1, col2);
    break;
  case 18: // information without small sample correction, no pseudocounts
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      result += entropyWeight * pairwiseEntropy6(col1, col2, aliSequenceWeights);
    }
    break;
  case 19:
    result = 0.5 * (singleEntropy3(col1, aliSequenceWeights) + singleEntropy3(col2, aliSequenceWeights));
    break;
  case 20:
    result = 0.5 * fabs(singleEntropy3(col1, aliSequenceWeights) - singleEntropy3(col2, aliSequenceWeights));
    break;
  case 21:
    // sum of both single column values, each standardized as (x - userMean) / userDev
    result = logistic((singleEntropy3(col1, aliSequenceWeights)-userMean)/userDev 
                      + (singleEntropy3(col2, aliSequenceWeights)-userMean)/userDev);
    break;
  case 22:
    // difference of both single column values, each standardized as (x - userMean) / userDev
    result = logistic((singleEntropy3(col1, aliSequenceWeights)-userMean)/userDev 
                      - (singleEntropy3(col2, aliSequenceWeights)-userMean)/userDev);
    break;  
  case 23:
    result = averageRandomRenamedAllowedPairs(col1, col2);
    break;
  case 24:
    // covariance score according to Hofacker:
    result = covariance1(col1, col2);
    break;
  case 25:
    // mutual information without any correction, not just matching nucleotides
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      result += entropyWeight * pairwiseEntropy7(col1, col2, aliSequenceWeights); 
    }
    break;
  case 26:
    // mutual information using random sampling correction
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      result += entropyWeight * pairwiseEntropy8Fast(col1, col2, aliSequenceWeights); 
    }
    break;
  case 27: // use average SINGLE column information!
    result = 0.5 * (singleEntropy8(col1, aliSequenceWeights) + singleEntropy8(col2, aliSequenceWeights));
    break;
  case 28: // use SINGLE column information of 5' end:
    result = singleEntropy8(col1, aliSequenceWeights);
    break;
  case 29: // use Single column information of 3' end
    result = singleEntropy8(col2, aliSequenceWeights);
    break;
  case 30: // most sophisticated way: use three different ways to correct for small sample size
    result = entropyWeight * pairwiseEntropy9Fast(col1, col2, aliSequenceWeights); 
    break;
  default: ERROR("Unknown algorithm in CompensationScorer! So far only 1 to 30 are defined.");
    }
  return result;
}

/** returns 1 for highly reliable prediction,
    0 for totally uncompatible prediction
    Faster version of previous: uses precomputed single values for algorithm 25 and 26.
*/
double
CompensationScorer::compensationScore(const string& col1,
				      const string& col2,
				      double singleVal1,
				      double singleVal2,
				      const Vec<double>& aliSequenceWeights) const {
  double result = 0.0;
  switch (algorithm) {
  case 0:
    result = 0.0; // no computation (useful for printing only reference structure)
    break;
  case 1:
    result = compatibilityScore(col1, col2);
    break;
  case 2:
    result = energyWeight * compatibilityScore(col1, col2) 
	- entropyWeight * pairwiseEntropy(col1, col2);
    // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
    break;
  case 3:
    result = energyWeight * compatibilityScore2(col1, col2) 
      - entropyWeight * pairwiseEntropy(col1, col2);
    // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
    break;
  case 4:
    result = energyWeight * compatibilityScore2(col1, col2) 
      + entropyWeight * pairwiseCorrelation(col1, col2);
      // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
    break;
  case 5:
    result = - compatibilityScore2(col1, col2) * pairwiseEntropy(col1, col2);
    // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
    break;
  case 6:
    if (compatibilityScore2(col1, col2) >= compMin) {
      result =  - pairwiseEntropy(col1, col2);
    }
    else {
      result = 0.0;
    }
    break;
  case 7:
    if (compatibilityScore3(col1, col2) >= compMin) {
      result =  - entropyWeight * pairwiseEntropy(col1, col2);
    }
    else {
      result = 0.0;
    }
    break;
  case 8:
    result = - compatibilityScore3(col1, col2) * pairwiseEntropy(col1, col2);
    // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
    break;
  case 9:
    result = energyWeight * compatibilityScore3(col1, col2) 
      - entropyWeight * pairwiseEntropy(col1, col2);
    // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
    break;
  case 10:
    // scaling: compress by factor of 20
    result = - logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
		*  pairwiseEntropy(col1, col2);
    break;
  case 11:
    // scaling: compress by factor of 20
    result = logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
      *  (1.0 - relativeEntropy(col1, col2));
    break;
  case 12:
    // like case 10, but use pairwiseEntropy2
    result = - logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
                   *  pairwiseEntropy2(col1, col2);
    break;
  case 13:
    // like case 10, but use pairwiseEntropy2 and using sequence weighting
    result = - logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
      *  pairwiseEntropy2(col1, col2, aliSequenceWeights);
    break;
  case 14:
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      // result += entropyWeight * pairwiseEntropy3(col1, col2, aliSequenceWeights); 
      result += entropyWeight * pairwiseEntropy3fast(col1, col2, aliSequenceWeights); 
    }
    break;
  case 15:
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      result += entropyWeight * pairwiseEntropy4(col1, col2, aliSequenceWeights); 
    }
    break;
  case 16:
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      result += entropyWeight * pairwiseEntropy5(col1, col2);
    }
    break;
  case 17:
//     if (energyWeight != 0.0) {
//       result += energyWeight * compatibilityScore(col1, col2);
//     }
//     else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
    result = logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
      * pairwiseEntropy5(col1, col2);
      // }
    break;
  case 18: // information without small sample correction, no pseudocounts
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      result += entropyWeight * pairwiseEntropy6(col1, col2, aliSequenceWeights);
    }
    break;
  case 19:
    result = 0.5 * (singleEntropy3(col1, aliSequenceWeights) + singleEntropy3(col2, aliSequenceWeights));
    break;
  case 20:
    result = 0.5 * fabs(singleEntropy3(col1, aliSequenceWeights) - singleEntropy3(col2, aliSequenceWeights));
    break;
  case 21:
    result = logistic((singleEntropy3(col1, aliSequenceWeights)-userMean)/userDev 
		      + (singleEntropy3(col2, aliSequenceWeights)-userMean/userDev));
    break;
  case 22:
    result = logistic((singleEntropy3(col1, aliSequenceWeights)-userMean)/userDev 
		      - (singleEntropy3(col2, aliSequenceWeights)-userMean/userDev));
    break;  
  case 23:
    result = averageRandomRenamedAllowedPairs(col1, col2);
    break;
  case 24:
    // covariance score according to Hofacker:
    result = covariance1(col1, col2);
    break;
  case 25:
    // mutual information without any correction, not just matching nucleotides
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      result += entropyWeight * pairwiseEntropy7(col1, col2, singleVal1, singleVal2, aliSequenceWeights); 
      // result += entropyWeight * pairwiseEntropy7fast(col1, col2, aliSequenceWeights); 
    }
    break;
  case 26:
    // mutual information using random sampling correction
    result = 0.0;
    if (energyWeight != 0.0) {
      result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      // use here information instead of uncertainty
      result += entropyWeight * pairwiseEntropy8Fast(col1, col2, singleVal1, singleVal2, aliSequenceWeights); 
    }
    break;
  case 30: // most soffisticated way: use three different ways to correct for small sample size
    result = entropyWeight * pairwiseEntropy9Fast(col1, col2, aliSequenceWeights); 
    break;
  default: ERROR("Unknown algorithm in CompensationScorer! So far only 1 to 22 are defined.");
    }
  return result;
}

/** returns 1 for highly reliable prediction,
    0 for totally uncompatible prediction */
double
CompensationScorer::singleScore(const string& col1, const Vec<double>& aliSequenceWeights) const {
  double result = 0.0;
  switch (algorithm) {
  case 0:
    result = 0.0;
    break;
    
//   case 1:
//     result = compatibilityScore(col1, col2);
//     break;
//   case 2:
//     result = energyWeight * compatibilityScore(col1, col2) 
// 	- entropyWeight * pairwiseEntropy(col1, col2);
//     // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
//     break;
//   case 3:
//     result = energyWeight * compatibilityScore2(col1, col2) 
//       - entropyWeight * pairwiseEntropy(col1, col2);
//     // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
//     break;
//   case 4:
//     result = energyWeight * compatibilityScore2(col1, col2) 
//       + entropyWeight * pairwiseCorrelation(col1, col2);
//       // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
//     break;
//   case 5:
//     result = - compatibilityScore2(col1, col2) * pairwiseEntropy(col1, col2);
//     // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
//     break;
//   case 6:
//     if (compatibilityScore2(col1, col2) >= compMin) {
//       result =  - pairwiseEntropy(col1, col2);
//     }
//     else {
//       result = 0.0;
//     }
//     break;
//   case 7:
//     if (compatibilityScore3(col1, col2) >= compMin) {
//       result =  - entropyWeight * pairwiseEntropy(col1, col2);
//     }
//     else {
//       result = 0.0;
//     }
//     break;
//   case 8:
//     result = - compatibilityScore3(col1, col2) * pairwiseEntropy(col1, col2);
//     // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
//     break;
//   case 9:
//     result = energyWeight * compatibilityScore3(col1, col2) 
//       - entropyWeight * pairwiseEntropy(col1, col2);
//     // + kt * (independentEntropy(col1, col2) - pairwiseEntropy(col1, col2));
//     break;
//   case 10:
//     // scaling: compress by factor of 20
//     result = - logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
// 		*  pairwiseEntropy(col1, col2);
//     break;
//   case 11:
//     // scaling: compress by factor of 20
//     result = logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
//       *  (1.0 - relativeEntropy(col1, col2));
//     break;
//   case 12:
//     // like case 10, but use pairwiseEntropy2
//     result = - logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
//                    *  pairwiseEntropy2(col1, col2);
//     break;
//   case 13:
//     // like case 10, but use pairwiseEntropy2 and using sequence weighting
//     result = - logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
//       *  pairwiseEntropy2(col1, col2, aliSequenceWeights);
//     break;
//   case 14:
//     result = 0.0;
//     if (energyWeight != 0.0) {
//       result += energyWeight * compatibilityScore(col1, col2);
//     }
//     else if (entropyWeight != 0.0) {
//       // use here information instead of uncertainty
//       // result += entropyWeight * pairwiseEntropy3(col1, col2, aliSequenceWeights); 
//       result += entropyWeight * pairwiseEntropy3fast(col1, col2, aliSequenceWeights); 
//     }
//     break;
//   case 15:
//     result = 0.0;
//     if (energyWeight != 0.0) {
//       result += energyWeight * compatibilityScore(col1, col2);
//     }
//     else if (entropyWeight != 0.0) {
//       // use here information instead of uncertainty
//       result += entropyWeight * pairwiseEntropy4(col1, col2, aliSequenceWeights); 
//     }
//     break;
//   case 16:
//     result = 0.0;
//     if (energyWeight != 0.0) {
//       result += energyWeight * compatibilityScore(col1, col2);
//     }
//     else if (entropyWeight != 0.0) {
//       // use here information instead of uncertainty
//       result += entropyWeight * pairwiseEntropy5(col1, col2);
//     }
//     break;
//   case 17:
// //     if (energyWeight != 0.0) {
// //       result += energyWeight * compatibilityScore(col1, col2);
// //     }
// //     else if (entropyWeight != 0.0) {
//       // use here information instead of uncertainty
//     result = logistic(25.0 * (compatibilityScore2(col1, col2) - compMin)) 
//       * pairwiseEntropy5(col1, col2);
//       // }
//     break;
//   case 18: // information without small sample correction, no pseudocounts
//     result = 0.0;
//     if (energyWeight != 0.0) {
//       result += energyWeight * compatibilityScore(col1, col2);
//     }
//     else if (entropyWeight != 0.0) {
//       // use here information instead of uncertainty
//       result += entropyWeight * pairwiseEntropy6(col1, col2, aliSequenceWeights);
//     }
//     break;
//   case 19:
//     result = 0.5 * (singleEntropy3(col1, aliSequenceWeights) + singleEntropy3(col2, aliSequenceWeights));
//     break;
//   case 20:
//     result = 0.5 * fabs(singleEntropy3(col1, aliSequenceWeights) - singleEntropy3(col2, aliSequenceWeights));
//     break;
//   case 21:
//     result = logistic((singleEntropy3(col1, aliSequenceWeights)-userMean)/userDev 
// 		      + (singleEntropy3(col2, aliSequenceWeights)-userMean/userDev));
//     break;
//   case 22:
//     result = logistic((singleEntropy3(col1, aliSequenceWeights)-userMean)/userDev 
// 		      - (singleEntropy3(col2, aliSequenceWeights)-userMean/userDev));
//     break;  
//   case 23:
//     result = averageRandomRenamedAllowedPairs(col1, col2);
//     break;
//   case 24:
//     // covariance score according to Hofacker:
//     result = covariance1(col1, col2);
//     break;
  case 25:
    // mutual information without any correction, not just matching nucleotides
    result = 0.0;
    if (energyWeight != 0.0) {
      ERROR("Energy weight not defined for single scores yet!");
      // result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      result += entropyWeight * singleEntropy7(col1, aliSequenceWeights); 
    }
    break;
  case 26:
    // mutual information using random sampling correction
    result = 0.0;
    if (energyWeight != 0.0) {
      ERROR("Energy weight not defined for single scores yet!");
      // result += energyWeight * compatibilityScore(col1, col2);
    }
    else if (entropyWeight != 0.0) {
      result += entropyWeight * singleEntropy8(col1, aliSequenceWeights);
    }
    break;
  case 30: // most soffisticated way: use three different ways to correct for small sample size
    result = entropyWeight * singleEntropy9(col1, aliSequenceWeights); 
    break;
  default: ERROR("Unknown algorithm in CompensationScorer! So far only 1 to 22 are defined.");
    }
  return result;
}



/** Decides whether an aligned character pair is to be ignored.
 *
 *  A pair is skipped when the letter id of either character lies outside
 *  of the alphabet (for example a "-" character), or when both characters
 *  are equal to GAP_CHAR.
 *
 *  @param c1  character taken from the first column
 *  @param c2  character taken from the second column
 *  @return true if the pair should be skipped
 */
bool
CompensationScorer::skipPair(char c1, char c2) const {
  const unsigned int firstId = letterId(c1);
  const unsigned int secondId = letterId(c2);
  const bool outsideAlphabet = (firstId >= alphabet.size())
    || (secondId >= alphabet.size());
  const bool doubleGap = (c1 == GAP_CHAR) && (c2 == GAP_CHAR);
  return outsideAlphabet || doubleGap;
}

/** Compares two pair strings independent of the order of their letters.
 *
 *  Only the characters at positions 0 and 1 of each string are inspected.
 *
 *  @param s1  first pair, e.g. "GC"
 *  @param s2  second pair, e.g. "CG"
 *  @return true if s2 holds the same two letters as s1, either in the
 *          same or in swapped order
 */
bool
CompensationScorer::isSamePair(const string& s1, const string& s2) {
  // identical orientation, e.g. "GC" versus "GC"
  if ((s1[0] == s2[0]) && (s1[1] == s2[1])) {
    return true;
  }
  // mirrored orientation, e.g. "GC" versus "CG"
  return (s1[0] == s2[1]) && (s1[1] == s2[0]);
}

/** Returns the allowed pairs with order-independent duplicates removed.
 *
 *  The allowed pairs are visited in their stored order; a pair is kept
 *  only if isSamePair() matches none of the pairs kept so far, so the
 *  first occurrence of each unordered pair is the one that is returned.
 *
 *  @return the de-duplicated list of allowed pairs
 */
Vec<string>
CompensationScorer::getUniqueAllowedPairs() const {
  Vec<string> uniquePairs;
  for (unsigned int pairIdx = 0; pairIdx < allowedPairs.size(); ++pairIdx) {
    const string& candidate = allowedPairs[pairIdx];
    // advance to the first already kept pair equivalent to the candidate
    unsigned int keptIdx = 0;
    while ((keptIdx < uniquePairs.size())
	   && !isSamePair(uniquePairs[keptIdx], candidate)) {
      ++keptIdx;
    }
    // reaching the end means no equivalent pair has been kept yet
    if (keptIdx == uniquePairs.size()) {
      uniquePairs.push_back(candidate);
    }
  }
  return uniquePairs;
}

/* MODIFIERS */

/* copy method */
void 
CompensationScorer::copy(const CompensationScorer& other)
{
  algorithm = other.algorithm;
  verboseLevel = other.verboseLevel;
  entropyStdMax = other.entropyStdMax;
  energyWeight = other.energyWeight;
  entropyWeight = other.entropyWeight;
  gapFracMin = other.gapFracMin;
  threshold = other.threshold;
  userMean = other.userMean;
  threshold = other.threshold;
  alphabet = other.alphabet;
  allowedPairs = other.allowedPairs;
  subMatrix = other.subMatrix;
  compatibilityExponent = other.compatibilityExponent;
  compMin = other.compMin;
  freqLookUpGaps = other.freqLookUpGaps;
  freqLookUpNorm = other.freqLookUpNorm;
  freqLookUp = other.freqLookUp;
  randomSampleNum = other.randomSampleNum;
  randomSampleSquareNum = other.randomSampleSquareNum;
}


/** Sets the entries of the pair frequency lookup table to zero for every
 *  ordered combination of two letters of the given alphabet.
 *
 *  Entries that belong to characters not contained in the given alphabet
 *  are left untouched.
 *
 *  @param alphabet  letters whose table entries are reset (this parameter
 *                   hides the member of the same name)
 */
void
CompensationScorer::clearLookUp(const string& alphabet) const
{
  for (unsigned int i = 0; i < alphabet.size(); ++i) {
    // Go through unsigned char: plain char may be signed, and a negative
    // value must never be used as a table index.
    const int row = static_cast<unsigned char>(alphabet[i]);
    for (unsigned int j = 0; j < alphabet.size(); ++j) {
      const int column = static_cast<unsigned char>(alphabet[j]);
      freqLookUp[row][column] = 0.0;
    }
  }
}

/** Adds the weighted character pair counts of two alignment columns to the
 *  pair frequency lookup table.
 *
 *  The table itself is accumulated (it is not cleared here), whereas
 *  freqLookUpNorm and freqLookUpGaps are reset at the start of each call.
 *  Afterwards freqLookUpNorm holds the summed weights of all rows without
 *  a gap character, and freqLookUpGaps the number of rows in which at
 *  least one of the two characters is GAP_CHAR.
 *
 *  @param col1  first alignment column
 *  @param col2  second alignment column, must have the length of col1
 *  @param wVec  weight of each sequence (row); needs at least one entry
 *               per row of the columns
 */
void
CompensationScorer::addCountsToLookUp(const string& col1, const string& col2, 
				      const Vec<double>& wVec) const
{
  PRECOND(col1.size() == col2.size());
  // every row needs a weight, otherwise wVec[i] would be read out of range
  PRECOND(wVec.size() >= col1.size());
  freqLookUpNorm = 0.0;
  freqLookUpGaps = 0;
  for (unsigned int i = 0; i < col1.size(); ++i) {
    // Go through unsigned char: plain char may be signed, and a negative
    // value must never be used as a table index.
    const int row = static_cast<unsigned char>(col1[i]);
    const int column = static_cast<unsigned char>(col2[i]);
    freqLookUp[row][column] += wVec[i];
    if ((col1[i] == GAP_CHAR) || (col2[i] == GAP_CHAR)) {
      ++freqLookUpGaps;
    }
    else {
      freqLookUpNorm += wVec[i];
    }
  }
}
