#include <limits.h>
#include <LocoRnaMain.h>
#include <iostream>
#include <iomanip>
#include <debug.h>
#include <MAFAlignment.h>
#include <MAFAlignmentTools.h>
#include <CorrelationFinder.h>
// #include <HashCorrelationFinder.h>
#include <HashCorrelationFinder3.h>
#include <Timer.h>
#include <fstream>
#include <GetArg.h>
#include <CorrelationTools.h>
#include <SearchTables.h>
#include <StemTools.h>
#include <MAFSearchTables.h>
#include <MAFSearchTables3.h>
#include <MainTools.h>
#include <BEDRegions.h>
#include <stemhelp.h>
#include <math.h>
#include <clusterAlgorithms.h>
#include <generalNumerics.h>
#include <InteractionClusterAnalyzer.h>
#include <AnnotationTools.h>

#ifdef COVARNA_CONCURRENT_VECTOR
#include <tbb/blocked_range.h>
#else
#include <ser_blocked_range.h>
#endif

#define REMARK cout << "# " 

/** Writes one output line per entry of results: the entry itself, the alignment block id and
 *  column id of its start and stop columns, the positions of both columns with respect to
 *  refAssembly (all ids and positions are shifted by +1 for output), and the common column pair
 *  as written by MAFAlignmentTools::writeCommonColumnPair. */
void
LocoRnaMain::writeResults(ostream& os , const MAFAlignment& maf, const result_container& results, const string& refAssembly) {
  result_container::const_iterator hit = results.begin();
  while (hit != results.end()) {
    // the result itself, then "block:column" of the start column and of the stop column
    os << *hit << " Blocks: ";
    os << (maf.getAlignmentId(hit->getStart()) + 1) << ":" << (maf.getAlignmentColumnId(hit->getStart()) + 1) << " ";
    os << (maf.getAlignmentId(hit->getStop()) + 1) << ":" << (maf.getAlignmentColumnId(hit->getStop()) + 1) << " ";
    // positions of both columns in the reference assembly
    os << " " << refAssembly << ": ";
    os << (maf.getAssemblyPosition(hit->getStart(), refAssembly) + 1) << " ";
    os << (maf.getAssemblyPosition(hit->getStop(), refAssembly) + 1) << " ";
    MAFAlignmentTools::writeCommonColumnPair(os, maf, hit->getStart(), hit->getStop());
    os << endl;
    ++hit;
  }
}

/** Writes every entry of results to os, one entry per line. */
void
LocoRnaMain::writeAssemblyResults(ostream& os, const result_container& results) {
  result_container::const_iterator entry = results.begin();
  while (entry != results.end()) {
    os << *entry << endl;
    ++entry;
  }
}

/** Transfers the command line (members argc / argv) into the option members of this object.
 *
 *  The first argument (argv[1]) is taken as the name of the input file; if it is missing, the
 *  usage message is written and ERROR is raised. All remaining options are read with getArg /
 *  isPresent. Judging from the usage message, a key "a" corresponds to the option "-a" and a key
 *  "-ambiguity" to "--ambiguity". The trailing argument of getArg appears to be the value that is
 *  used if the option is not given (see the "-strand2" call) - NOTE(review): confirm against GetArg.h.
 *  The order of the calls matters in a few places (see comments below).
 */
void
LocoRnaMain::parseCommandLine() {
  if (argc < 2) {
    writeUsageMessage(*osp);
    ERROR("No filename specified!");
  }
  filename = argv[1];
  getArg("a", refAssembly, argc, argv, refAssembly);
  if (refAssembly == "_") {
    refAssembly = ""; // workaround: "_" on the command line stands for an empty reference assembly name
  }
  getArg("-ambiguity", ambiguityMode, argc, argv, ambiguityMode);
  getArg("-annotate", annotateInFileName, argc, argv, annotateInFileName);
  getArg("-annotate-out", annotateOutFileName, argc, argv, annotateOutFileName);
  getArg("-anti", checkAntiNeighborMode, argc, argv, checkAntiNeighborMode);
  getArg("b", bedFileName, argc, argv, bedFileName);
  getArg("-basepairs", basepairTypeMin, argc, argv, basepairTypeMin);
  getArg("-b1", bedFileName1, argc, argv, bedFileName1);
  getArg("-b2", bedFileName2, argc, argv, bedFileName2);
  // the general filter file (-b) is used for each alignment that has no specific filter file (--b1 / --b2)
  if ((bedFileName1.size() == 0) && (bedFileName.size() > 0)) {
    bedFileName1 = bedFileName;
  }
  if ((bedFileName2.size() == 0) && (bedFileName.size() > 0)) {
    bedFileName2 = bedFileName;
  }
  getArg("-block-min", blockMin,argc, argv, blockMin);
  if (blockMin > 0) {
    --blockMin; // internal counting is zero-based
  } 
  getArg("-block-max", blockMax,argc, argv, blockMax);
  getArg("-block-min2", blockMin2, argc, argv, blockMin2);
  if (blockMin2 > 0) {
    --blockMin2; // internal counting is zero-based
  }
  getArg("-block-max2", blockMax2, argc, argv, blockMax2);
  getArg("c", collapseAssembly, argc, argv, collapseAssembly);
  getArg("-cluster", clusterCutoff, argc, argv, clusterCutoff);
  // bool clusterFilterInactive = false;
  // if both switches are given, "off" wins because it is evaluated last
  if (isPresent("-cluster-filter-on", argc, argv)) {
      clusterFilterActive = true;
  }
  if (isPresent("-cluster-filter-off", argc, argv)) {
      clusterFilterActive = false;
  }
  // getArg("-cluster-filter-off", clusterFilterInactive, argc, argv);
  // clusterFilterActive = !clusterFilterInactive;
  getArg("-cluster-min", clusterColMin, argc, argv, clusterColMin);
  getArg("d", corrDistMin, argc, argv, corrDistMin);
  getArg("-density", stemDensity, argc, argv, stemDensity);
  getArg("-dif", densInFileName, argc, argv, densInFileName);
  getArg("-dof", densOutFileName, argc, argv, densOutFileName);
  getArg("e", appendFileName, argc, argv, appendFileName);
  if (appendFileName.size() > 0) {
    sameChrom = false; // default behavior: assume two MAF files: two different chromosomes. Otherwise specify "--same-chrom"
  } else {
    sameChrom = true;
  }
  getArg("-emax", eMax, argc, argv, eMax);
  getArg("-expand-max", expandClusterMaxAllowed, argc, argv, expandClusterMaxAllowed);
  getArg("f", assemblyPairFraction, argc, argv, assemblyPairFraction);
  ERROR_IF((assemblyPairFraction <= 0.0) || (assemblyPairFraction > 1.0),
	   "Assembly-pair-fraction (parameter -f) must be greater zero and smaller or equal one!");
  getArg("i", outputIntervall, argc, argv, outputIntervall);
  getArg("-ignore", tabooAssemblyNames, argc, argv, tabooAssemblyNames); // option "--taboo" is now called "--ignore"
  // complementMode is read via a temporary int (its declared type is not visible in this file)
  int complementModeInt = complementMode;
  getArg("m" , complementModeInt, argc, argv, complementModeInt);
  complementMode = complementModeInt;
  getArg("-multi", multiTestMode, argc, argv, multiTestMode);
  getArg("-noself", noSelfMode, argc, argv);  // no stems with start == stop position
  getArg("-opposite", oppositeMode, argc, argv);
  getArg("o", bedOutFileName, argc, argv, bedOutFileName); // output of results in UCSC BED file format
  getArg("p", pvalMode, argc, argv);
  getArg("-pad", padding, argc, argv, padding);
  getArg("-prune", pruneAfter, argc, argv, pruneAfter);
  getArg("-require", requiredAssemblyNames, argc, argv, requiredAssemblyNames);
  // reverseMode is read via a temporary int, like complementMode above
  int reverseModeInt = reverseMode;
  getArg("r" , reverseModeInt, argc, argv, reverseModeInt);
  reverseMode = reverseModeInt;
  // seqMin is read via a temporary int as well; it is written back further below
  int seqMinInt = seqMin;
  getArg("s", seqMinInt, argc, argv, seqMinInt);
  if (!sameChrom) {
    getArg("-same-chrom", sameChrom, argc, argv); // checks if option --same-chrom is present: force to assume same chromosome for two MAF files
  }
  getArg("-stem", stemLengthMin, argc,argv, stemLengthMin); // minimum stem length (2 or 3)
  getArg("-stem-p", stemPMax, argc, argv, stemPMax);
  getArg("-search-max", searchColumnMax, argc, argv, searchColumnMax);
  getArg("-shuffle", shuffleMode, argc, argv, shuffleMode);
  getArg("-strand", strandMode1, argc, argv, strandMode1);
  getArg("-strand2", strandMode2, argc, argv, strandMode1); // use strand-mode 1 as default for strand 2
  // deprecated alias of "--ignore": read after "-ignore" into the same variable
  getArg("-taboo", tabooAssemblyNames, argc, argv, tabooAssemblyNames);
  seqMin = seqMinInt; // workaround
  getArg("v", verbose, argc, argv, verbose);
  // --opposite overrides both strand modes; without a second file the first file is used a second time
  if (oppositeMode) {
    strandMode1 = MAFAlignment::STRAND_PLUS;
    strandMode2 = MAFAlignment::STRAND_MINUS;
    if (appendFileName.size() == 0) {
      appendFileName = filename; // reload first MAF file again as second file
      noSelfMode = true;
    }
  }
}

/** Splits line into words using getTokens (empty words are not allowed) and returns the set of
 *  distinct words. The delimiter string is handed unchanged to getTokens.
 *  An empty line yields an empty set. */
set<string>
LocoRnaMain::tokenizeToSet(const string& line, string delimiter) {
  set<string> uniqueTokens;
  if (line.size() == 0) {
    return uniqueTokens; // nothing to tokenize
  }
  const vector<string> tokens = getTokens(line, delimiter, false); // false: do not allow empty words
  uniqueTokens.insert(tokens.begin(), tokens.end());
  return uniqueTokens;
}

/** Re-expresses the start position of each stem relative to the opposite end of a region of length
 *  _totLength: the new start is totLength - start - length. The stop position of each stem is
 *  reduced by (length - 1).
 *  Raises ERROR if _totLength exceeds INT_MAX or if the consistency check for a stem fails; in the
 *  latter case the offending stem is reported on standard output first.
 *  NOTE(review): the consistency check compares totLength with (start - length), whereas the new
 *  start is computed from (start + length) - confirm that this is intended. */
void
LocoRnaMain::reverseStemStarts(Vec<Stem>& stems, long _totLength) {
  ERROR_IF(_totLength > INT_MAX, "Total length of chromosome exceeds integer representation size");
  Stem::index_type totLength = static_cast<Stem::index_type>(_totLength);
  for (Vec<Stem>::size_type idx = 0; idx < stems.size(); ++idx) {
    Stem& stem = stems[idx];
    const bool inconsistent = (totLength < (stem.getStart() - stem.getLength()));
    if (inconsistent) {
      REMARK << "Problem with stem " << (idx + 1) << ": " << stem
             << " . Total length of chrom: " << totLength
             << " . Inconsistent total length for found stem!" << endl;
    }
    ERROR_IF(inconsistent, "Inconsistent total length for found stem!");
    stem.setStart(totLength - stem.getStart() - stem.getLength());
    stem.setStop(stem.getStop() - (stem.getLength() - 1));
  }
}

/** Re-expresses the stop position of each stem relative to the opposite end of a region of length
 *  _totLength: the new stop is totLength - stop - 1.
 *  Raises ERROR if _totLength exceeds INT_MAX or if totLength is smaller than (stop - 1) for a stem;
 *  in the latter case the offending stem is reported on standard output first. */
void
LocoRnaMain::reverseStemStops(Vec<Stem>& stems, long _totLength) {
  ERROR_IF(_totLength > INT_MAX, "Total length of chromosome exceeds integer representation size");
  Stem::index_type totLength = static_cast<Stem::index_type>(_totLength);
  for (Vec<Stem>::size_type idx = 0; idx < stems.size(); ++idx) {
    Stem& stem = stems[idx];
    const bool inconsistent = (totLength < (stem.getStop() - 1));
    if (inconsistent) {
      REMARK << "Problem with stem " << (idx + 1) << ": " << stem
             << " . Total length of chrom: " << totLength
             << " . Inconsistent total length for found stem!" << endl;
    }
    ERROR_IF(inconsistent, "Inconsistent total length for found stem!");
    stem.setStop(totLength - stem.getStop() - 1);
  }
}


/** Writes the one-line program banner (including the version returned by getVersion()) to os. */
void
LocoRnaMain::writeWelcomeMessage(ostream& os) {
  os << "# Welcome to COVARNA (v";
  os << getVersion();
  os << "), the alignment covariation finder.";
  os << endl;
}

/** Writes the command line usage summary (program call syntax and one line per option) to os.
 *
 *  Option names follow the keys used in parseCommandLine: a getArg key "x" is the option "-x",
 *  a key "-name" is the option "--name".
 *
 *  @param os stream that receives the usage text
 */
void
LocoRnaMain::writeUsageMessage(ostream& os) {
  os << "Usage:" << endl << "covarna filename [options]" << endl
     << "Input file: Sequence alignment in UCSC Genome Browser MAF format" << endl
     << "Options:" << endl
     << "-a ASSEMBLY    : specify name of reference assembly." << endl
     // this entry was missing its line break, which glued the next option onto the same line
     << "--ambiguity 1|2|3 : GU match ambiguity mode; 1: No GU matches , 2: reverse complement GU matches, 3: GU-match mode for forward matches. This option should be simplified in the future." << endl
     << "--annotate FILENAME : count number of covarying alignment column pairs in existing regions (given in BED file format)" << endl
     << "--annotate-out FILENAME : output of count of number of covarying alignment column pairs in existing regions (given in BED file format)" << endl
     << "--anti 0|1|2   : filter checking \"wrong\" diagonals not to be complemtary. 0: off (default); 2: most strict"
     << endl
     << "-b FILTERFILE  : filter all input alignments by these regions. Must be in UCSC Genome Browser BED format." << endl
     << "--basepairs 2|3|4 : at least this many different types of base pairs for each covarying alignment column pair." << endl
     << "--b1 FILTERFILE  : filter first input alignment by these regions. Must be in UCSC Genome Browser BED format." << endl
     << "--b2 FILTERFILE  : filter second input alignment by these regions. Must be in UCSC Genome Browser BED format." << endl
     << "--block-min NUMBER : first MAF block to read. Default: 1" << endl
     << "--block-max NUMBER : last MAF block to read. Default: 0 (read all MAF blocks)" << endl
     << "--block-min2 NUMBER : first MAF block to read from second MAF file. Default: 1" << endl
     << "--block-max2 NUMBER : last MAF block to read from second MAF file. Default: 0 (read all MAF blocks)" << endl
     << "-c ASSEMBLY    : collapse alignments with respect to this assembly. Example: -c hg18" << endl
     // parsed with key "-cluster", i.e. a double-dash option
     << "--cluster DISTANCE : distance cutoff (in bases) for clustering of found stems." << endl
     << "--cluster-filter-on  : if set, single-linkage clustering is performed during the search. The use of this option is discouraged." << endl
     << "--cluster-filter-off  : if set, no single-linkage clustering is performed during the search." << endl
     << "--cluster-min 1...n  : minimum number of column pairs with compensatory base changes." << endl
     << "-d DISTANCE    : Minimum distance (integer) of correlated alignment columns" << endl
     << "-e FILENAME    : Filename of second alignment file (MAF format) to be appended. If specified, only cross-correlations between those two files are reported." << endl
     << "--expand-max VALUE : Maximum allowed length of consecutive sequence covariation. Default: 30" << endl
     << "-f FRACTION    : Determines fraction of possible hash tables that are actually generated. Values: (0,1]. Higher values mean faster search and more memory consumption. Default: 1.0" << endl
     << "-i INTERVALL   : User output in search step intervals of this size. " << endl
     << "--ignore ASSEMBLY1,ASSEMBLY2,...  : list of genome assemblies that should be ignored during reading of alignment." << endl
     << "-m 1|0         : Complement mode. If set to 1 (default), search for complementary columns, not matching ones." << endl
     << "--noself       : filter out stems that have equal start and stop position" << endl
     << "-o OUTPUTFILE  : output of cluster intervals in UCSC BED file format." << endl 
     << "--opposite     : Combination mode: corresonds to --strand 1 --strand2 -1 --noself" << endl
     << "--pad N  : Adds flanking regions to red BED format filter intervals. Typical value: 200 for adding 200nt on both sides of each interval. Goes together with options -b, --b1, --b2" << endl
     // parsed with key "-prune", i.e. a double-dash option
     << "--prune N      : If set, read at most this many sequences per alignment block." << endl
     << "-r 1|0         : Reverse mode. If set to 1 (default), looking for stretches of reverse (complementary) columns." << endl 
     << "--require ASSEMBLY1,ASSEMBLY2,...  : list of genome assemblies that should be required during reading of alignment. All other genome assemblies are being ignored." << endl
     << "-s SEQMIN      : minimum number of sequences for reading of MAF file. Default: 10" << endl
     // parsed with key "-same-chrom", i.e. a double-dash option
     << "--same-chrom   : force to assume that for two given MAF files, the reference genome is from the same chromosomes." << endl
     << "--stem 1|2|3|4  : minimum stem length." << endl
     << "-t THREADS     : Maximum number of parallel threads (executable covarnap). Value must be greater zero. " << endl
     << "--taboo ASSEMBLY1,ASSEMBLY2,...  : Deprecated; use option --ignore instead. list of genome assemblies that should be ignored during reading of alignment." << endl
     << "-v 0|1|2|3|4   : set verbose level (0:silent, 1: default)" << endl;
//   os << "# Deprecated options:" << endl
//      << "-density       : number of expected stems per area (per sites squared)" << endl
//      << "--dif filename : density input filename. " << endl
//      << "--dof filename : density output filename. " << endl
//      << "--emax VALUE   : maximum evalues of listed interaction clusters. Default: -1 (no maximum e-value specified)" << endl 
//      << "--multi 0|1|2|3  : multiple-testing correction. 0: no clustering; 1: total area; 2: total area / eff. cluster area (default); 3: total area / cluster area" << endl
//      << "-p             : If set, compute p-values and E-values of found stems." << endl
//      << "--search-max NUMBER : ignore columns that would lead to clusters with more than this many columns." << endl
//      << "--shuffle 0|1|2 : shuffling of MAF alignment blocks. 0: no shuffling; 1: dinucleotide-preserving shuffling." << endl
//      << "--stem-p 0..1 : Limits clusters to have smaller P value for stem-bias" << endl    
//      << "--strand 1|-1|0 : strand mode: MAF blocks are converted to plus strand (1) or minus strand (-1) of reference genome. 0: no conversion." << endl
//      << "--strand2 1|-1|0 : strand mode of second MAF alignments: MAF blocks are converted to plus strand (1) or minus strand (-1) of reference genome. 0: no conversion." << endl;

}

/** Stores, for every stem, the value returned by finder.computeLogPValue as the stem's energy.
 *  If isInternal is false, a copy of the stem is first passed through
 *  CorrelationTools::convertAssemblyToInternalCoordinates (using the MAF of the finder) and the
 *  value is computed for the converted stem. */
void
LocoRnaMain::addPValue(Vec<Stem>& stems, 
		       const CorrelationFinder& finder, bool isInternal) {
  for (size_type idx = 0; idx < stems.size(); ++idx) {
    double logPVal;
    if (!isInternal) {
      // work on a copy, so that the stored stem is not handed to the converter
      Stem converted(stems[idx]);
      logPVal = finder.computeLogPValue(CorrelationTools::convertAssemblyToInternalCoordinates(converted, *(finder.getMaf()))); // FIXIT: convert back??
    } else {
      logPVal = finder.computeLogPValue(stems[idx]);
    }
    stems[idx].setEnergy(logPVal);
  }
}

/** Stores, for every "forward" stem, the value returned by finder.computeForwardLogPValue as the
 *  stem's energy. If isInternal is false, the stem is first passed through
 *  CorrelationTools::convertAssemblyToInternalCoordinates (using the MAF of the finder).
 *  NOTE(review): unlike addPValue, the stored stem itself (not a copy) is handed to the converter;
 *  whether that can modify the stored stem depends on the converter's signature - confirm. */
void
LocoRnaMain::addForwardPValue(Vec<Stem>& stems, const CorrelationFinder& finder, bool isInternal) {
  for (size_type idx = 0; idx < stems.size(); ++idx) {
    double logPVal;
    if (!isInternal) {
      // convert from assembly to internal coordinates
      logPVal = finder.computeForwardLogPValue(CorrelationTools::convertAssemblyToInternalCoordinates(stems[idx], *(finder.getMaf())));
    } else {
      logPVal = finder.computeForwardLogPValue(stems[idx]);
    }
    stems[idx].setEnergy(logPVal);
  }
}


/** Writes the elements of the iterator range [first, last) to os, one element per line. */
void
LocoRnaMain::writeCorrelations(ostream& os,
			       result_container::const_iterator first,
			       result_container::const_iterator last) const {
  result_container::const_iterator cursor = first;
  while (cursor != last) {
    os << (*cursor) << endl;
    ++cursor;
  }
}

// Returns only those correlations that go between two alignments: the start has to be smaller
// than offset and the stop has to be at least offsetMin. For each returned correlation the stop
// position is reduced by offset.
// NOTE(review): the stop is tested against offsetMin but shifted by offset - confirm that the
// callers guarantee a non-negative result.
Vec<Correlation>
LocoRnaMain::filterCrossCorrelations(const Vec<Correlation>& corrs, length_type offset, length_type offsetMin) {
  Vec<Correlation> crossing;
  for (Vec<Correlation>::size_type idx = 0; idx < corrs.size(); ++idx) {
    Correlation candidate(corrs[idx]);
    if (!(candidate.getStart() < offset)) {
      continue; // start is not in front of the offset
    }
    if (!(candidate.getStop() >= offsetMin)) {
      continue; // stop is below the minimum offset
    }
    candidate.setStop(candidate.getStop() - offset);
    crossing.push_back(candidate);
  }
  return crossing;
}

/** Attaches the sequences of both strands to a stem.
 *
 *  The first strand covers the assembly positions start ... start + length - 1, the second strand
 *  covers stop - length + 1 ... stop. Both sequences are obtained with
 *  maf.extractAssemblySequence and stored with setSequence1 / setSequence2.
 *  A stem whose length is not greater than zero is left unchanged.
 *
 *  @param stem stem to be modified
 *  @param maf  alignment from which the sequences are extracted
 */
void
LocoRnaMain::addStemSequence(Stem& stem,
			     const MAFAlignment& maf) { // const MAFAlignment& maf2 ) {
  // The former implementation repeated the identical extraction once per stem position, although
  // nothing in it depended on the position. It is now done once; the guard keeps the former
  // behavior of doing nothing for stems without positive length.
  if (stem.getLength() > 0) {
    MAFAlignment::length_type first1 = stem.getStart();
    MAFAlignment::length_type last1  = stem.getStart() + stem.getLength() - 1;
    MAFAlignment::length_type first2 = stem.getStop() - stem.getLength() + 1;
    MAFAlignment::length_type last2  = stem.getStop();
    string seq = maf.extractAssemblySequence(first1, last1);
    string seq2 = maf.extractAssemblySequence(first2, last2);
    stem.setSequence1(seq);
    stem.setSequence2(seq2);
  }
}


/** Calls addStemSequence for every stem of stems, in order, using the alignment maf. */
void
LocoRnaMain::addStemSequences(Vec<Stem>& stems,
			      const MAFAlignment& maf) {
  Vec<Stem>::size_type idx = 0;
  while (idx < stems.size()) {
    addStemSequence(stems[idx], maf);
    ++idx;
  }
}

/** Writes the argc entries of argv to os, each followed by a single blank, and ends the line. */
void parameterOutput(ostream& os, int argc, char** argv) {
  int idx = 0;
  while (idx < argc) {
    os << argv[idx] << " ";
    ++idx;
  }
  os << endl;
}

int
LocoRnaMain::run() {
  Timer timer;
  timer.start();
  *osp << "# Program called with parameters: ";
  MainTools::parameterOutput(*osp, argc, argv); // output of parameters
  length_type ali2Offset = 0;
  length_type offsetMin = 10000;
  bool clusterAgainMode = false; // true leads to problems with MAF access 
  BEDRegions bed1;
  BEDRegions bed2;
  bool useStemEnergiesAsDensities = false; // an internal constant used in final clustering
  InteractionClusterAnalyzer * clusterAnalyzer = new InteractionClusterAnalyzer; // allocates a lot of memory 
  if (bedFileName1.size() > 0) {
    REMARK << "Reading BED Format data for filtering from " << bedFileName1 << endl;
    ifstream bedFile(bedFileName1.c_str());
    ERROR_IF(!bedFile, "Error opening BED format file: " + bedFileName1);
    bed1.read(bedFile, BEDRegions::STRAND_IGNORE);
    bedFile.close();
    ERROR_IF(refAssembly.size() == 0, 
	     "If BED filter is specified with -b, the reference assembly (example: hg18) has to be specified with option -a");
    bed1.setAssembly(refAssembly); // bed data must be with respect to reference assembly
    REMARK << "Reading BED Format data finished." << endl;
    if (verbose > 1) {
      REMARK << endl << bed1 << endl;
    }
    ERROR_IF(!bed1.validate(), "Internal error: Filter data does not validate!");
  } else {
    ERROR_IF(bed1.validate(), "Internal error: Filter data validates even though it is not defined!");
  }
  if (bedFileName2.size() > 0) {
    if (bedFileName2 == bedFileName1) {
      bed2 = bed1;
    } else {
      REMARK << "Reading BED Format data for filtering from " << bedFileName2 << endl;
      ifstream bedFile(bedFileName2.c_str());
      ERROR_IF(!bedFile, "Error opening BED format file: " + bedFileName2);
      bed2.read(bedFile, BEDRegions::STRAND_IGNORE);
      bedFile.close();
      ERROR_IF(refAssembly.size() == 0, 
	       "If BED filter is specified with -b, the reference assembly (example: hg18) has to be specified with option -a");
      bed2.setAssembly(refAssembly); // bed data must be with respect to reference assembly
      REMARK << "Reading BED Format data finished." << endl;
      if (verbose > 1) {
	REMARK << endl << bed1 << endl;
      }
      ERROR_IF(!bed2.validate(), "Internal error: Filter data does not validate!");
    }
  } else {
    ERROR_IF(bed2.validate(), "Internal error: Filter data validates even though it is not defined!");
  }

  if (padding > 0) {
    if (verbose > 0) {
      REMARK << "Adding flanking regions of up to " << padding << " nt to all filter intervals" << endl;
    }
    bed1.addPadding(padding);
    bed2.addPadding(padding);
    if (verbose > 1) {
      REMARK << "Filter intervals (1) after addition of flanking regions: " << endl;
      REMARK << endl << bed1 << endl;
      REMARK << "Filter intervals (2) after addition of flanking regions: " << endl;
      REMARK << endl << bed2 << endl;
    }
  }
  if (bed1.validate() && (verbose > 0)) {
    REMARK << "Total lengths of filtered regions (1) (per chromosome):" << endl;
    bed1.writeLengthCounts(cout, "# "); // also provide prefix for output
  }
  if (bed2.validate() && (verbose > 0)) {
    REMARK << "Total lengths of filtered regions (2) (per chromosome):" << endl;
    bed2.writeLengthCounts(cout, "# "); // also provide prefix for output
  }
  MAFAlignment maf;
  MAFAlignment maf2; // careful: this was formerly a local variable, but for analysis purposes it is kept at this level. More memory consumption!
  if (refAssembly.size() > 0) {
    maf.setRefAssembly(refAssembly);
  }
  MAFAlignment::count_hash_type assemblyCombLengths1; // stores for each combination of assemblies of first MAF the total number of alignment columns
  MAFAlignment::count_hash_type assemblyCombLengths2; // stores for each combination of assemblies of second MAF the total number of alignment columns
  maf.reserve(MAF_RESERVE);
  if (verbose > 1) {
    REMARK << "Setting verbose level " << verbose << endl;
  }
  maf.setVerbose(verbose);
  maf.setPruneAfter(pruneAfter);
  set<string> requiredAssemblies = tokenizeToSet(requiredAssemblyNames,",;:!");
  maf.setRequiredAssemblies(requiredAssemblies);
  maf.setSeqMin(seqMin);
  maf.setStrandMode(strandMode1);
  set<string> tabooAssemblies = tokenizeToSet(tabooAssemblyNames,",;:%!");
  if (shuffleMode == 2) {
    REMARK << "WARNING: MAF alignments are shuffled during reading!" << endl;
    maf.setShuffleMode(true);
  }
  maf.setTabooAssemblies(tabooAssemblies);
  ifstream ifs(filename.c_str());
  ERROR_IF(!ifs, "Error reading input file!");
  REMARK << "Reading " << argv[1] << endl;
  Timer readTimer;
  readTimer.start();
  if (bed1.validate()) {
    maf.read(ifs, bed1, blockMin, blockMax); // read using BED filter
  } else {
    BEDRegions emptyBed;
    maf.read(ifs, emptyBed, blockMin, blockMax); // read not using BED filter
  }
  readTimer.stop();
  ifs.close();
  bed1 = BEDRegions(); // intervall data not needed anymore, save memory
  REMARK << "Successfully read alignment data with " << maf.size() << " alignment blocks and " << maf.getTotalLength() << " columns." << endl;
  REMARK << "Total number of initially stored characters: " << maf.computeCharacterCount() << endl;
  // length_type endcol = maf.getTotalLength();
  if (refAssembly.size() == 0) {
    refAssembly = maf.getRefAssembly();
    //     SequenceAlignment::properties_type::const_iterator pit = maf[0].getSequenceProperties(0).find("assembly");
    //     if (pit != maf[0].getSequenceProperties(0).end()) {
    //       refAssembly = pit->second;
    //     }
    //     REMARK << "No reference assembly specified. Using information from first sequence of first alignment block: " << refAssembly << endl; 
  }
  if (verbose > 2) {
    REMARK << "RefChromStarts field: " << maf << ends;
  }
  if (collapseAssembly.size() > 0) {
    REMARK << "Collapsing alignments with respect to this assembly: " << collapseAssembly << " ... " << endl;
    maf.collapseAssembly(collapseAssembly);
    REMARK << "New total length: " << maf.getTotalLength() << endl;
    REMARK << "New total number of stored characters: " << maf.computeCharacterCount() << endl;
  }
  if (shuffleMode == 3) {
    REMARK << "Shuffling first set of alignment blocks vertically ...";
    maf.shuffleVertical();
    cout << " Done." << endl;
  }
  assemblyCombLengths1 = maf.countAssembliesHashLengths();
  length_type searchMax = maf.getTotalLength(); // might be changed later
  length_type totalLength1 = maf.getTotalLength();
  length_type totalLength2 = 0; // no second alignment defined yet
  length_type refAssemblyTotLength1 = maf.getRefAssemblyTotLength();
  length_type refAssemblyTotLength2 = maf.getRefAssemblyTotLength(); // override later
  string refAssemblyChrom = maf.getRefAssemblyChrom();
  string refAssemblyChrom2 = maf.getRefAssemblyChrom();
  if (appendFileName.size() > 0) {
    corrDistMin = 0; // corrDistMin > 0 does not make sense in two-sequence mode; only search inter-sequence correlations
    if (appendFileName == filename) {
      REMARK << "WARNING: the names of the two specified alignment files are identical. This can lead to a skewed analysis; better provide this filename only once, not using the -e option" << endl;
    }
    ifstream ifs2(appendFileName.c_str());
    ERROR_IF(!ifs2, "Error reading input file!");
    if (refAssembly.size() > 0) {
      maf2.setRefAssembly(refAssembly);
    }
    maf2.setPruneAfter(pruneAfter);
    maf2.setRequiredAssemblies(tokenizeToSet(requiredAssemblyNames,","));
    maf2.reserve(MAF_RESERVE);
    maf2.setSeqMin(seqMin);
    if (shuffleMode == 2) {
      maf2.setShuffleMode(true);
    }
    maf2.setStrandMode(strandMode2);
    maf2.setTabooAssemblies(tokenizeToSet(tabooAssemblyNames,","));
    maf2.setVerbose(verbose);
    REMARK << "Starting to read second alignment " << appendFileName << " ... " << endl;
    if (bed2.validate()) {
      maf2.read(ifs2, bed2, blockMin2, blockMax2);
    } else {
      BEDRegions emptyBed;
      maf2.read(ifs2, emptyBed, blockMin2, blockMax2);
    }
    ifs2.close();
    REMARK << "Successfully read second alignment with " << maf2.size() << " alignment blocks and " << maf2.getTotalLength() << " columns. Appending to first alignment:" << endl;
    REMARK << "Total number of initially stored characters of second alignment : " << maf2.computeCharacterCount() << endl;
    refAssemblyTotLength2 = maf2.getRefAssemblyTotLength();
    refAssemblyChrom2 = maf2.getRefAssemblyChrom();
    bed2 = BEDRegions(); // intervall data not needed anymore, save memory
    if (collapseAssembly.size() > 0) {
      REMARK << "Collapsing alignments of second MAF with respect to this assembly: " << collapseAssembly << " ... " << endl;
      maf2.collapseAssembly(collapseAssembly);
      REMARK << "New total length of second MAF: " << maf2.getTotalLength() << endl;
      REMARK << "New total number of stored characters of second alignment : " << maf2.computeCharacterCount() << endl;
    }
    assemblyCombLengths2 = maf2.countAssembliesHashLengths();
    ali2Offset = maf.getChromStart(maf.size() -1, refAssembly) + maf[maf.size()-1].getLength() + offsetMin; // these are assembly coordinates
    REMARK << "Debug info: alignment offset 2: " << ali2Offset << endl;
    if (ali2Offset > 0) {
      if (verbose > 1) {
	REMARK << "Temporarily adding genome position offset of " << ali2Offset << " to second MAF: " << endl; 
      }
      // REMARK << "Debug info 1:" << maf.getChromStart(maf.size()-1, refAssembly) << " " << maf2.getChromStart(0, refAssembly) << endl;
      maf2.addChromStartOffset(ali2Offset, refAssembly); 
      // REMARK << "Debug info 2:" <<  maf.getChromStart(maf.size()-1, refAssembly) << " " << maf2.getChromStart(0, refAssembly) << endl;
      ASSERT(maf.getChromStart(maf.size()-1, refAssembly) + static_cast<length_type>(maf[maf.size()-1].getLength()) 
	     < maf2.getChromStart(0, refAssembly));
    }
    if (collapseAssembly.size() > 0) {
      REMARK << "Collapsing second alignments with respect to this assembly: " << collapseAssembly << " ... " << endl;
      maf2.collapseAssembly(collapseAssembly);
      REMARK << "New total length of second MAF alignment: " << maf2.getTotalLength() << endl;
    }
    if (shuffleMode == 3) {
      REMARK << "Shuffling second set of alignment blocks vertically ...";
      maf2.shuffleVertical();
      cout << " Done." << endl;
    }
    totalLength2 = maf2.getTotalLength();
    maf.append(maf2);
    searchMax = maf.getTotalLength(); // new: search WHOLE length, because of details regarding columns
    if (ali2Offset != 0) {
      maf2.addChromStartOffset(-ali2Offset, refAssembly);  // subtracting offset again
      if (verbose > 2) {
	REMARK << "Subtracting again offset for alignment 2: " << ali2Offset << endl;
	// REMARK << "Debug info 3:" <<  maf.getChromStart(maf.size()-1, refAssembly) << " " << maf2.getChromStart(0, refAssembly) << endl;
	// REMARK << "Debug info 4:" <<  maf.getRefChromStarts() << endl;
      }
    }
  } else {
    bed2 = BEDRegions(); // intervall data not needed anymore, save memory
  }
  switch (shuffleMode) {
  case 0: break; // no shuffling;
  case 1: {
    REMARK << "Shuffling alignments!" << endl;
    bool shuffleColumnMode = true; // shuffle rows AND columns
    maf.dinucleotideShuffle(shuffleNormLimit, shuffleColumnMode);
    // maf2.dinucleotideShuffle(shuffleNormLimit, iterations, shuffleColumnMode); // maf2 not needed any more
  }
    break;
  case 2:
    REMARK << "Alignment blocks where shuffled horizontally during reading." << endl;
    break;
  case 3: 
    REMARK << "Alignment blocks where shuffled vertically after reading." << endl;
    break;
  default: ERROR("Unknown shuffle mode! Allowed values: 0: no shuffling; 1: Dinucleotide shuffling (deprecated); 2: horizontal shuffling (deprecated); 3: vertical shuffling");
  }
  ASSERT(maf.validateColumnIds());
  REMARK << "Initializing search between " << refAssembly << " " 
	 << refAssemblyChrom << " ( " << refAssemblyTotLength1 << " ) and " 
	 << refAssemblyChrom2 << " ( " << refAssemblyTotLength2 << " ) " << endl;
  if (refAssemblyChrom == refAssemblyChrom2) {
    REMARK << "Searching within same chromosome." << endl;
    // sameChrom = true; // do not change: sameChrom indicates if rectangle or triangle area is searched. Simple check if option -e was given for second alignment 
  }

  if (verbose > 0) {
    REMARK << "Starting to initialize finder..." << endl;
  }
  HashCorrelationFinder3::result_type resultBins(maf.getTotalLength());
  for (size_type i = 0; i < resultBins.size(); ++i) {
    resultBins[i] = (new HashCorrelationFinder3::result_vector_type());  
  }
  Timer hashTimer;
  hashTimer.start();
  CorrelationFinder * finder;
  bool pairMode = false; // decides whether to use nucleotide pair or triplet mode.
  if (pairMode) {
    if (verbose > 1) {
      REMARK << "Using nucleotide pair mode." << endl;
    }
    ERROR("Nucleotide pair mode not implemented.");
    // MAFSearchTables * tables = new MAFSearchTables();
    // tables->setMAF(&maf);
    // tables->setAssemblyPairFraction(assemblyPairFraction);
    // // if (appendFileName.size() > 0) { // in this mode, search results have to be part of second MAF, queries are part of first MAF:
    // //   tables->setSearchRangeMin(searchMax+1);
    // // }
    // tables->setVerbose(verbose);
    // tables->run(); // generate hash tables
    // ERROR_IF(!tables->validate(), "Internal error: search tables did not validate!");
    // finder = new HashCorrelationFinder(&maf, tables, &resultBins);
  } else {
    if (verbose > 1) {
      REMARK << "Using nucleotide triplet mode." << endl;
    }
    MAFSearchTables3 * tables = new MAFSearchTables3;
    tables->setAmbiguityMode(ambiguityMode);
    tables->setMAF(&maf);
    tables->setAssemblyPairFraction(assemblyPairFraction);
    //    if (appendFileName.size() > 0) { // in this mode, search results have to be part of second MAF, queries are part of first MAF:
    //     tables->setSearchRangeMin(searchMax+1);
    //   }
    tables->setVerbose(verbose);
    tables->run(refAssembly);
    ERROR_IF(!tables->validate(), "Internal error: search tables did not validate!");
    finder = new HashCorrelationFinder3(&maf, tables, &resultBins);
    if (appendFileName.size() > 0) { // in this mode, search results have to be part of second MAF, queries are part of first MAF:
      finder->setSearchColumnSplit(totalLength1); // split search : only interested in finding covariations that span the two regions.
    }
  }
  hashTimer.stop();
  finder->setBasepairTypeMin(basepairTypeMin);
  finder->setClusterCutoffAndInit(static_cast<length_type>(clusterCutoff));
  finder->setCheckAntiNeighborMode(checkAntiNeighborMode);
  finder->setClusterFilterActive(clusterFilterActive);
  finder->setClusterFilterSizeMin(clusterColMin);
  finder->setComplementMode(complementMode);
  finder->setCorrDistMin(corrDistMin);
  finder->setNonGapMin(seqMin);
  finder->setOutIntervall(outputIntervall);
  finder->setSearchColumnMax(searchColumnMax);
  finder->setStemLengthMin(stemLengthMin);
  finder->setReverseMode(reverseMode);
  finder->setVerbose(verbose);
  ASSERT(finder->getClusterFilter().isActive() == clusterFilterActive);
  if (verbose > 0) {
    REMARK << "Successfully initialized finder." << endl;
  }
  ASSERT(finder->getClusterFilter().isActive() == clusterFilterActive);
  REMARK << "Starting search!" << endl;
  #ifdef COVARNA_CONCURRENT_VECTOR
  blocked_range<length_type> range(0, searchMax);
  #else
  ser_blocked_range<length_type> range(0, searchMax);
  #endif
  Timer searchTimer;
  searchTimer.start();
  ASSERT(finder->getClusterFilter().isActive() == clusterFilterActive);
  finder->run(range);  // central method that starts the search for covariation
  ASSERT(finder->getClusterFilter().isActive() == clusterFilterActive);
  searchTimer.stop();
  REMARK << "Search finished!" << endl;//  Fraction of ignored columns (%): " << setprecision(3) << (100.0 * static_cast<double>(finder.getIgnoredCount()) / maf.getTotalLength()) << endl;
  result_container results = finder->getResults();

  REMARK << "Found " << results.size() << " correlations." << endl;
  if (appendFileName.size() > 0) {
    result_container resultsTmp;
    for (result_container::size_type jj = 0; jj < results.size(); ++jj) {
      // cout << "# checking " << results[jj] << "\t" << totalLength1 << "\t" << totalLength2 << "\t" << searchMax << " : ";
      if (((results[jj].getStart() < totalLength1) && (results[jj].getStop() >= totalLength1))
	  || ((results[jj].getStart() >= totalLength1) && (results[jj].getStop() < totalLength1)) ) {
	resultsTmp.push_back(results[jj]);
      }  else {
      }
    }
    if (resultsTmp.size() < results.size()) {
      results = resultsTmp;
    }
    REMARK << "Only kept " << results.size() << " correlations that span the two specified genomic regions." << endl;
  }
  REMARK << "Cluster filter status: " << finder->getClusterFilter().isActive() << endl; 
  ASSERT(finder->getClusterFilter().isActive() == clusterFilterActive);
  InteractionClusterAnalyzer::double_hash_type densities;
  if (densInFileName.size() > 0) {
    ifstream densInFile(densInFileName.c_str());
    ERROR_IF(!densInFile,"Error reading dens in file!");
    REMARK << "Reading densities from file " << densInFileName << endl;
    densities = InteractionClusterAnalyzer::readDensities(densInFile);
    densInFile.close();
  } else {
    bool addEmpty = true; // densities for areas without apparent covariation will be added later with augmentDensities
    if (densOutFileName.size() > 0) {
      ofstream densOutFile(densOutFileName.c_str());
      ERROR_IF(!densOutFile, "Error writing density file " + densOutFileName);
      REMARK << "Writing densities to file " << densOutFileName << endl;
      // InteractionClusterAnalyzer::writeDensities(densOutFile, densities, &densOutFile);
      densities = finder->computeDensities(searchMax, addEmpty, &densOutFile); // HashCorrelationFinder3::computeDensities();
      densOutFile.close();
    } else {
      // densities = finder->computeDensities(searchMax, addEmpty, NULL); // HashCorrelationFinder3::computeDensities();
    }
    // if (assemblyCombLengths2.size() > 0) {
      // densities = InteractionClusterAnalyzer::generateDensities(HashCorrelationFinder3::getMatchPairCountHash(), assemblyCombLengths1, assemblyCombLengths2);
      // densities = InteractionClusterAnalyzer::generateDensities(HashCorrelationFinder3::getMatchPairCountHash2(), assemblyCombLengths1, assemblyCombLengths2;)
    // } else {
      // densities = InteractionClusterAnalyzer::generateDensities(HashCorrelationFinder3::getMatchPairCountHash(), assemblyCombLengths1);
      // densities = InteractionClusterAnalyzer::generateDensities(HashCorrelationFinder3::getMatchPairCountHash2(), assemblyCombLengths1);
      // densities = finder->computeDensities(); // HashCorrelationFinder3::computeDensities();
    // }
  }
  // after densities have been written to file, augment densities of areas in which no covariation was found:
  // finder->augmentDensities(densities, searchMax); // new: taken out

  if (verbose > 4) { // output of unfiltered covarying columns
    REMARK << "Unfiltered results: " << endl;
    finder->writeRawResults(cout, searchMax);
  }
  // the found covariation columns will be augmented with densities:
  // InteractionClusterAnalyzer::augmentCorrelationDensitities(results, densities, maf, maf2, true); // true : still working in internal coordinate mode
  if (verbose > 1) {
    writeResults(*osp, maf, results, refAssembly); // write output
  }
  if (refAssembly.size() > 0) {
    if (verbose > 0) {
      REMARK << "Convert to coordinates of assembly (first column has index one): " << refAssembly << endl;
    }
    CorrelationTools::convertInternalToAssemblyCoordinates(results.begin(), results.end(), refAssembly, maf);
  }
  REMARK << "Number of remaining correlations: " << results.size() << endl;
  if (ali2Offset != 0) {
    REMARK << "Filtering such that only cross-correlations remain... ";
    results = filterCrossCorrelations(results, ali2Offset, offsetMin);// filter out only correlations that go between two alignments
    REMARK << results.size() << endl;
  }
  REMARK << "Number of remaining correlations: " << results.size() << endl;
  if (annotateInFileName.size() > 0) {
    ifstream annotateInFile(annotateInFileName.c_str());
    ERROR_IF(!annotateInFile, "Error opening annotation file: " + annotateInFileName);
    AnnotationTools annotator;    
    if (annotateOutFileName.size() ==  0) {
      annotateOutFileName = annotateInFileName + "_anno.bed";
    }
    ofstream annotateOutFile(annotateOutFileName.c_str());
    ERROR_IF(!annotateOutFile, "Error opening annotation output file: " + annotateOutFileName);      
    if (verbose > 0) {
      REMARK << "Annotated regions read from " << annotateInFileName << " are written to file " << annotateOutFileName << endl;
    }
    annotator.annotateRegions(annotateInFile, annotateOutFile, results, refAssemblyChrom, refAssemblyChrom2, static_cast<length_type>(ceil(clusterCutoff)));
    annotateInFile.close();
    annotateOutFile.close();
  }
  Vec<Stem> stems;
  if (reverseMode) {
    if (complementMode) {
      REMARK << "Converting found correlations into regular reverse-complement stems:" << endl;
    } else {
      REMARK << "Converting found correlations into reverse-matching (not complementary) stems:" << endl;
    }
    if (appendFileName.size() == 0) {
      stems = CorrelationTools::convertCorrelationsToStems(results, corrDistMin, stemLengthMin);
    } else {
      stems = CorrelationTools::convertCorrelationsToStems(results, 0, stemLengthMin); // corrDistMin does not make sense for two-sequence mode
    }
    if (pvalMode) {
      if (verbose > 0) {
	REMARK << "Computing E-values..." << endl;
      }
      addPValue(stems, *finder, false); // false: using assembly coordinates, not internal coordinates
      if (verbose > 0) {
	REMARK << "Computing E-values finished." << endl;
      }
    }
  } else {
    REMARK << "Due to chosen mode (reverse: " << reverseMode << " complement: " << complementMode << ") , cannot convert found correlations into regular reverse-complement stems, using instead \"forward\" regions:"  << endl;
    if (appendFileName.size() == 0) {
      stems = CorrelationTools::convertCorrelationsToForwardStems(results, corrDistMin, stemLengthMin);
    } else {
      stems = CorrelationTools::convertCorrelationsToForwardStems(results, 0, stemLengthMin); // corrDistMin does not make sense for two-sequence mode
    }
    if (pvalMode) {
      if (verbose > 0) {
	REMARK << "Computing E-values..." << endl;
      }
      addForwardPValue(stems, *finder, false);
      if (verbose > 0) {
	REMARK << "Computing E-values finished." << endl;
      }
    }
  }
  addStemSequences(stems, maf); // , maf2);
  // convert notation to + strand
  if (strandMode1 == MAFAlignment::STRAND_MINUS) {
    // convert start position to plus strand
    reverseStemStarts(stems, refAssemblyTotLength1);
  }
  if (strandMode2 == MAFAlignment::STRAND_MINUS) {
    // convert start position to plus strand
    reverseStemStops(stems, refAssemblyTotLength2);
  }
  if (noSelfMode) {
    stems = filterNoSelfStems(stems); // filter out self stems (from stemhelp)
  }
  // cluster again using SingleLinkage2DProgressiveFilter:
  if (clusterAgainMode) {
    if (reverseMode && complementMode) {
    ERROR("Re-clustering using single-linkage filtering is currently not implemented.");
    SingleLinkage2DProgressiveFilter clusterFilter = finder->getClusterFilter(); // makes a copy
    ERROR_IF(clusterFilter.getElementCount() > 0, "Internal error: cluster filter should be empty after search.");
    clusterFilter.setDelay(0); // actually changing filter behavior, because not used for parallel mode anymore
    clusterFilter.reset(); // prepare for another search
    REMARK << "Clustering again using single-linkage filter with initially " << stems.size() << " stems. Result: " << endl;
    stems = CorrelationTools::singleLinkageFilter(stems, clusterFilter);
    REMARK << stems.size() << " stems." << endl;
    } else { // other cases
      ERROR("Sorry, cluster-again mode not implemented for modes other than reverse-complement search.");
    }
  }
  if (verbose > 1) {
    REMARK << "Initial list of " << stems.size() << " found stems:" << endl;
    MainTools::writeStems(*osp, stems, totalLength1, totalLength2, pvalMode,reverseMode);
  }
  Timer clusterTimer;
  clusterTimer.start();
  REMARK << "Writing results to file " << bedOutFileName << endl;
  ofstream bedOutFile(bedOutFileName.c_str());
  if (bedOutFile) {
    bedOutFile << "# Result generated by COVARNA (version " << getVersion() << " ) " << endl;
    bedOutFile << "# Parameters: ";
    MainTools::parameterOutput(bedOutFile, argc, argv); // output of parameters
  }

  if (verbose > 1) {
    writeHash(cout, HashCorrelationFinder3::getMatchPairCountHash());
    REMARK << "Assembly lengths for first MAF:" << endl;
    writeHash(cout, assemblyCombLengths1);
    REMARK << "Assembly lengths for second MAF:" << endl;
    writeHash(cout, assemblyCombLengths2);
  }
  if ((stemDensity <= 0.0) && (stemLengthMin == 1)) {
    double totalArea = static_cast<double>(totalLength1) * static_cast<double>(totalLength2);
    if (totalArea <= 0.0) {
      totalArea = static_cast<double>(totalLength1) * static_cast<double>(totalLength1);
    }
    stemDensity = HashCorrelationFinder3::getMatchPairCount() / totalArea;
    if (verbose > 1) {
      cout << "Estimating density of column pairs as " << stemDensity << " based on count of " 
	   << HashCorrelationFinder3::getMatchPairCount() << " matches in an area of " << totalArea << " nucleotides squared." << endl;
    }
  }
  clusterAnalyzer->analyzeClustersFast(cout, bedOutFile, stems, clusterCutoff, static_cast<double>(totalLength1),
				   static_cast<double>(totalLength2), 
				   sameChrom, stemDensity, eMax, stemPMax, stemLengthMin, clusterColMin, 
				   expandClusterMaxAllowed, 
                                   complementMode, reverseMode, maf, maf2, 
				   HashCorrelationFinder3::getMatchPairCountHash(), assemblyCombLengths1, assemblyCombLengths2, useStemEnergiesAsDensities,
				       multiTestMode, densities); // , stemBiasPMode);
  clusterTimer.stop();
  timer.stop();

  if (bedOutFile) {
    bedOutFile.close();
  }
  REMARK << "Compute time : Total: " << timer << " Reading: " << readTimer << " Hash-creation: " << hashTimer 
	 << " Searching: " << searchTimer << " Clustering: " << clusterTimer << endl;

  REMARK << "Good bye!" << endl;
  
  return 0;
}
