/*=======================================================================
 *** THE CONTENT OF THIS WORK IS PROPRIETARY TO FEI S.A.S, (FEI S.A.S.),            ***
 ***              AND IS DISTRIBUTED UNDER A LICENSE AGREEMENT.                     ***
 ***                                                                                ***
 ***  REPRODUCTION, DISCLOSURE,  OR USE,  IN WHOLE OR IN PART,  OTHER THAN AS       ***
 ***  SPECIFIED  IN THE LICENSE ARE  NOT TO BE  UNDERTAKEN  EXCEPT WITH PRIOR       ***
 ***  WRITTEN AUTHORIZATION OF FEI S.A.S.                                           ***
 ***                                                                                ***
 ***                        RESTRICTED RIGHTS LEGEND                                ***
 ***  USE, DUPLICATION, OR DISCLOSURE BY THE GOVERNMENT OF THE CONTENT OF THIS      ***
 ***  WORK OR RELATED DOCUMENTATION IS SUBJECT TO RESTRICTIONS AS SET FORTH IN      ***
 ***  SUBPARAGRAPH (C)(1) OF THE COMMERCIAL COMPUTER SOFTWARE RESTRICTED RIGHT      ***
 ***  CLAUSE  AT FAR 52.227-19  OR SUBPARAGRAPH  (C)(1)(II)  OF  THE RIGHTS IN      ***
 ***  TECHNICAL DATA AND COMPUTER SOFTWARE CLAUSE AT DFARS 52.227-7013.             ***
 ***                                                                                ***
 ***                   COPYRIGHT (C) 1996-2019 BY FEI S.A.S,                        ***
 ***                        BORDEAUX, FRANCE                                        ***
 ***                      ALL RIGHTS RESERVED                                       ***
**=======================================================================*/
/*=======================================================================
** Author      : Mike Heck (Nov 2005)
**=======================================================================*/

/*----------------------------------------------------------------------------------------
 * LoadDataAsynch
 * Example: Load LDM data for computation - using asynchronous loading
 *
 * Builds on the example "LoadData"
 *
 * Please see the associated programming guide document:
 *    PG-DataManagementVolVizLDM.pdf
 * located in the SDK directory .../examples/source/VolumeViz/Compute
 *
 * Note this example is optimized to load a data volume, e.g. seismic
 * amplitudes, already in LDM format.  The default data file below is
 * a SEGY file which is provided with the Open Inventor SDK.  The
 * example still works because VolumeViz will automatically convert
 * the data to LDM format "on the fly".  However this conversion will
 * reduce performance compared to loading an LDM format file.
 *
 * VolumeViz LDM already uses multiple threads to load multiple tiles
 * in parallel.  This is an automatic parallelization provided by the
 * library and helps overlap waiting for disk or network latencies.
 * However it does not allow the application to completely overlap data
 * loading with computation, which is an increasingly critical goal on
 * multi-core processors. For this we will need the asynchronous data
 * access methods and a slightly more complex program structure.
 *
 * Using the synchronous methods we needed 2 data buffers, one to hold
 * the input data and one to hold the output data (the result of our
 * computation). Using the asynchronous methods we will need (at least)
 * 3 data buffers. We will always use one buffer for the result of the
 * computation, but at any given time we will be using one buffer for
 * input for the computation and VolumeViz will be filling the third
 * buffer with the next data block.
 *
 * Before we start we need to "prime the pump" by loading the first
 * data block synchronously.  Then we can submit a request for the 2nd
 * data block and begin computing on the first block while the 2nd
 * block is being loaded.  When we are finished computing the first
 * block we may need to wait for the loading of the second block to
 * finish.  When the second block is loaded we can swap the buffer
 * pointers, request the third block and begin computing the second
 * block.  In pseudo-code it looks like this:
 *
 *   pDataBuf = Buffer0;
 *   pLoadBuf = Buffer1;
 *   getData( block, pDataBuf ); // Synchronous get 1st block
 *   while (moreBlocks) {
 *     requestData( block++, pLoadBuf ); // Asynch request
 *     compute( pDataBuf, pResultBuf );  // Compute loaded block
 *     waitForData();                    // Wait for asynch block
 *     temp = pDataBuf;
 *     pDataBuf = pLoadBuf;
 *     pLoadBuf = temp;
 *   }
 *   compute( pDataBuf, pResultBuf ); // Compute the last block
 *
----------------------------------------------------------------------------------------*/

#include <stdio.h>
#include <stdlib.h>
#include <atomic>
#include <cinttypes>

#include <Inventor/SoDB.h>
#include <Inventor/SbElapsedTime.h>

#include <VolumeViz/nodes/SoVolumeData.h>
#include <VolumeViz/readers/SoVRSegyFileReader.h>

#include <Inventor/helpers/SbFileHelper.h>

////////////////////////////////////////////////////////////////////////
// Constants

// Default data file.
// main() falls back to this path when no input file name is given on the
// command line.  (The "$OIVHOME" part is presumably resolved by
// SbFileHelper::expandString -- confirm against the SbFileHelper docs.)
SbString DefaultDataFile = SbFileHelper::expandString ("$OIVHOME/examples/data/VolumeViz/Waha8.sgy");

// Enable more verbose output (will affect processing time)
//   <= 0 : no per-block progress output
//   2    : print the bounds of every block that is requested
//   any other positive value : print a one-line "n of total" progress counter
static int debugFlag = 1;

// Estimated number of values we can compute per second
// (used to simulate computation time)
// doComputation() divides the number of values in a block by this rate
// to decide how long to sleep.
const float COMPUTE_PER_SEC = 5e6f;

// Name of the results file given with the "-o" command line option.
SbString outputFileName;
// Stream that receives the volume information and the timing report.
// main() opens outputFileName in append mode for it, and uses stderr
// when no file was given or the file could not be opened.
FILE* outputFILE=NULL;

////////////////////////////////////////////////////////////////////////
// Local functions

// Pretty print numbers for benchmark results.
// Writes the formatted text into _buffer and returns _buffer.
// The caller owns _buffer and must make it large enough for the text
// (the callers in main() use 80-character buffers).
char *formatNumberApproximate( double _value, char *_buffer );

// Do the computation (whatever it is) on one block of data.
//   pSrcBuffer : input data
//   pDstBuffer : output data
//   actDim     : dimensions of the data block
//   dataType   : type of data (int, float, etc)
// The definition in this file ignores the buffers and the data type and
// only sleeps to simulate compute time; it always returns 0.
int doComputation( void* pSrcBuffer, void* pDstBuffer,
                   SbVec3i32 actDim, SoVolumeData::DataType dataType );

////////////////////////////////////////////////////////////////////////
// Custom data access class

// Data access object that adds a blocking "wait" on top of the
// asynchronous requestData / endRequest mechanism of SoLDMDataAccess.
//
// Limitation: a single ready flag is used, so only ONE request may be
// outstanding at a time (which is all this example needs).
//
// Threading: endRequest is called from a VolumeViz data loader thread
// while waitForData polls from the requesting thread, so the flag is a
// std::atomic<bool> rather than a plain bool.
class MyAsynchLDMDataAccess : public SoLDMDataAccess
{
public:
  // No request is pending initially, so there is nothing to wait for.
  MyAsynchLDMDataAccess() : m_dataReady( true ) {}

  // Override standard requestData method.
  // Set data-not-ready before submitting request to parent class.
  //
  // Returns the request id from the parent class unchanged.  A negative
  // id means the data was already in memory; endRequest will not be
  // called in that case, so the ready flag is set here to make sure a
  // later waitForData cannot block forever.
  int requestData( int resolution, const SbBox3i32& subVolume, SoBufferObject* buffer)
  {
    // Must be cleared BEFORE submitting: the loader thread may call
    // endRequest before the parent requestData even returns.
    m_dataReady = false;
    const int requestId = SoLDMDataAccess::requestData( resolution, subVolume, buffer );
    if (requestId < 0)
      m_dataReady = true;
    return requestId;
  }

  // SoLDMDataAccess does not provide a blocking call to wait for the
  // data to be ready, so wait for callback to set our data ready flag.
  // Polls every 10 milliseconds; the request id is ignored because only
  // one request is tracked.
  void waitForData( int /*requestId*/ )
  {
    while (! m_dataReady) {
      SbTime::sleep( 10 );
    }
  }

protected:
  // True when no request is pending or the pending request has finished.
  // Written by the loader thread, read by the requesting thread.
  std::atomic<bool> m_dataReady;

  // This method will be called from a VolumeViz data loader thread.
  // Set data-ready flag so the waitForData method can return.
  void endRequest(int /*requestId*/)
  {
    m_dataReady = true;
  }
};

////////////////////////////////////////////////////////////////////////

int main( int argc, char **argv )
{
  // Get filename and check that it exists
  char *filename = NULL;

  int iarg;
  if (argc > 1)
  {
    filename = argv[1];
    
    for ( iarg = 2; iarg < argc; iarg++ ) 
    {
      if (argv[iarg][0] == '-') 
      {
        char option = argv[iarg][1];
        switch (option) 
        {
        case 'o':
          iarg++;
          outputFileName = SbString(argv[iarg]);
          outputFILE = fopen(outputFileName.getString(), "a");
          break;
        }
        continue;
      }
    }
  }

 if ( outputFILE == NULL )
    outputFILE = stderr; 

 if (filename == NULL) 
    filename = (char*)DefaultDataFile.toLatin1(); // default
 
 FILE *fp = fopen( filename, "r" );
 if (!fp) 
 {
    fprintf( stderr, "Unable to open input file '%s'\n", filename );
    return -1;
  }
  fclose( fp );

  fprintf(outputFILE, "Input: '%s'\n", filename );

  // Initialize (don't need any window system classes)
  SoDB::init();
  SoVolumeRendering::init();

  // Load the volume
  SoVolumeData *pVolData = new SoVolumeData();
  pVolData->ref();
  pVolData->fileName.setValue( filename );

  // Get some information about the volume
  SbBox3f   volSize  = pVolData->extent.getValue();
  SbVec3i32 volDim   = pVolData->data.getSize();
  int       bytesPV  = pVolData->getDataSize(); // BytesPerValue
  SbVec3i32 tileDim  = pVolData->ldmResourceParameters.getValue()->tileDimension.getValue();
  int       tileSize = tileDim[0];
  SoVolumeData::DataType dataType = pVolData->getDataType();

  fprintf(outputFILE, "Volume dimensions: %d %d %d\n"
          "       bytesPerValue: %d  tileSize: %d\n",
          volDim[0], volDim[1], volDim[2], bytesPV, tileSize );

  // Estimate number of full resolution tiles
  int xtiles = (int)ceil( (float)volDim[0] / tileDim[0]);
  int ytiles = (int)ceil( (float)volDim[1] / tileDim[1]);
  int ztiles = (int)ceil( (float)volDim[2] / tileDim[2]);

  // The number of values in one data block is the number of values in
  // a single tile times the number of tiles in a block, which in this
  // case is the number of tiles on the X axis of the volume.
  //
  // Similarly the number of data blocks we need to process is (in this
  // case) the number of Y tiles times the number of Z tiles.
  unsigned int valuesPerTile  = tileDim[0] * tileDim[1] * tileDim[2];
  unsigned int valuesPerBlock = xtiles * valuesPerTile;
  int64_t valuesTotal = (int64_t)volDim[0] * volDim[1] * volDim[2];
  unsigned int numBlocks = ytiles * ztiles;

  fprintf(outputFILE, "       Values in tile: %d  in block: %d  total: %" PRId64 "\n",
    valuesPerTile, valuesPerBlock, valuesTotal );
  fprintf(outputFILE, "       Tiles in block: %d  in volume: %d\n",
    xtiles, xtiles * ytiles * ztiles );
  fprintf(outputFILE, "       Blocks to process: %d\n", numBlocks );

  // Allocate memory for input data blocks
  //
  // Note we need two data buffers for asynchronous access so we can
  // compute on one buffer while the other buffer is being filled.
  //
  // The number of bytes in one stack is the number of data values
  // times the number of bytes in one value.  Note this number is very
  // conservative because it assumes there are no partial tiles.
  size_t numBytes = valuesPerBlock * bytesPV;
  SoRef<SoCpuBufferObject> pSrcBuffer0 = new SoCpuBufferObject();
  pSrcBuffer0->setSize(numBytes);

  SoRef<SoCpuBufferObject> pSrcBuffer1 = new SoCpuBufferObject();
  pSrcBuffer1->setSize(numBytes);

  // Allocate memory for output data block
  SoRef<SoCpuBufferObject> pDstBuffer = new SoCpuBufferObject();
  pDstBuffer->setSize(numBytes);

  // Create a data access object and associate it witn our data volume.
  // This has to be an instance of our own data access class so we
  // could override the endRequest method.
  //
  // We also need a data access result object.
  MyAsynchLDMDataAccess asynchDataAccess;
  asynchDataAccess.setDataSet( pVolData );
  SoVolumeData::LDMDataAccess::DataInfoBox dataInfo;

  // Some local variables for convenience
  SbBox3i32 subvol;
  SbVec3i32 actDim;

  // Create and initialize timers
  // Note: We can't completely measure load time when using asynch access
  //       because loading and computing should be overlapped. However
  //       we will measure load time for the first block, request time
  //       (should be small) and wait time (hopefully will be small).
  int64_t bytesLoaded = 0;
  int     blocksLoaded = 0;
  SbElapsedTime loadTimer, compTimer, totalTimer;
  double loadTime, compTime, totalTime;
  loadTime = compTime = totalTime = 0;
  totalTimer.reset();

  // These are the buffer pointers we will swap during the algorithm
  SoCpuBufferObject *pDataBuffer = pSrcBuffer0.ptr();
  SoCpuBufferObject *pLoadBuffer = pSrcBuffer1.ptr();

  // Compute bounds of first data block
  subvol.setBounds( 0, 0, 0, volDim[0]-1, tileDim[1]-1, tileDim[2]-1 );

  // Load first block synchronously (we have to wait anyway)
  //
  // NOTE: We cannot use our separate data access object (the
  //       variable named asynchDataAccess) to do synchronous
  //       getData calls. Must use volume's built-in data access.
  printf( "Loading first block...\n" );
  loadTimer.reset();
  dataInfo = pVolData->getLdmDataAccess().getData( 0, subvol, pDataBuffer );
  if (dataInfo.errorFlag != SoVolumeData::LDMDataAccess::CORRECT) {
    printf( "*** ERROR loading block 0... error=%d\n", dataInfo.errorFlag );
  }
  actDim = dataInfo.bufferDimension;
  blocksLoaded++;
  bytesLoaded += dataInfo.bufferSize;
  loadTime += loadTimer.getElapsed();

  // Setup for calculating data blocks on tile boundaries
  //
  // Tiles always contain exactly tileDim voxels.
  // We'll limit the region to the actual volume dimensions.
  //
  // We don't really need to precompute these values, the extra math
  // wouldn't slow down the loop much.  It's just clearer this way.
  int yinc   = tileDim[1];
  int zinc   = tileDim[2];
  int xLimit = volDim[0] - 1;
  int yLimit = volDim[1] - 1;
  int zLimit = volDim[2] - 1;

  bool firstBlock = true;
  printf( "Processing data blocks...\n" );

  // Loop over all stacks (data blocks)
  for (int iz = 0; iz < ztiles; ++iz) {
    for (int iy = 0; iy < ytiles; ++iy) {

      // If this is the first block, well we already loaded it!
      if (firstBlock) {
        iy++;
        firstBlock = false;
      }

      // Compute bounds of this stack of tiles
      int xmin = 0;
      int xmax = xLimit;
      int ymin = iy * yinc;
      int ymax = ymin + tileDim[1] - 1;
      if (ymax > yLimit)
        ymax = yLimit;
      int zmin = iz * zinc;
      int zmax = zmin + tileDim[2] - 1;
      if (zmax > zLimit)
        zmax = zLimit;
      subvol.setBounds( xmin, ymin, zmin, xmax, ymax, zmax );

      if (debugFlag == 2)
        printf( "  Loading block %3d of %d : %d %d %d -> %d %d %d\n",
          blocksLoaded+1, numBlocks, xmin, ymin, zmin, xmax, ymax, zmax );
      else if (debugFlag > 0)
        printf( "  %d of %d\r", blocksLoaded+1, numBlocks );

      // Request next block of data (this call returns immediately)
      loadTimer.reset();
      int requestId = asynchDataAccess.requestData( 0, subvol, pLoadBuffer );
      loadTime += loadTimer.getElapsed();

      // Do computation on block already loaded
      compTimer.reset();
      doComputation( pDataBuffer, pDstBuffer.ptr(), actDim, dataType );
      compTime += compTimer.getElapsed();

      // Wait for requested block to finish loading
      // Note that if requestId is negative, endRequest will not be called.
      // Also note that getRequestedData must not be called with a negative value.
      loadTimer.reset();
      if (requestId < 0) {  // Data block was already in memory
        requestId = -requestId;
      }
      else {                // Data not ready yet, have to wait for it
        asynchDataAccess.waitForData( requestId );
      }
      asynchDataAccess.getRequestedData( requestId, dataInfo );
      if (dataInfo.errorFlag != SoVolumeData::LDMDataAccess::CORRECT) {
        printf( "*** ERROR loading block %d... error=%d\n"
                "    Block %d : %d %d %d -> %d %d %d\n",
				blocksLoaded, dataInfo.errorFlag, blocksLoaded, xmin, ymin, zmin, xmax, ymax, zmax);
      }

      // Update parameters of block we just loaded
      actDim = dataInfo.bufferDimension;
      blocksLoaded++;
      bytesLoaded += dataInfo.bufferSize;
      loadTime += loadTimer.getElapsed();

      // Swap the buffer pointers and go around again
      SoCpuBufferObject* temp = pDataBuffer;
      pDataBuffer = pLoadBuffer;
      pLoadBuffer = temp;
    } // End of inner loop (over Z tiles)
  }

  // Do computation on the final block of data
  compTimer.reset();
  doComputation( pDataBuffer, pDstBuffer.ptr(), actDim, dataType );
  compTime += compTimer.getElapsed();

  // Report timing results
  totalTime = totalTimer.getElapsed();
  {
    double valuesLoaded = (double)volDim[0] * volDim[1] * volDim[2];
    double BytesPerSec = (double)bytesLoaded / loadTime;
    double ValsPerSec  = valuesLoaded / loadTime;
    char buf1[80], buf2[80], buf3[80];
    formatNumberApproximate( double(numBytes)   , buf1 );
    formatNumberApproximate( BytesPerSec, buf2 );
    formatNumberApproximate( ValsPerSec , buf3 );
    fprintf(outputFILE, "Data loading: <%g> sec for %sB : %sB/sec (%s-Values/sec)\n",
      loadTime, buf1, buf2, buf3 );
    ValsPerSec = valuesLoaded / compTime;
    formatNumberApproximate( valuesLoaded, buf1 );
    formatNumberApproximate( ValsPerSec, buf2 );
    fprintf(outputFILE, "Computation : <%g> sec for %s-Values : %s-Values/sec\n",
      compTime, buf1, buf2 );
    ValsPerSec = valuesLoaded / totalTime;
    formatNumberApproximate( ValsPerSec, buf2 );
    fprintf(outputFILE, "Total time  : <%g> sec for %s-Values : %s-Values/sec\n",
      totalTime, buf1, buf2 );
  }

  // Cleanup
  pSrcBuffer0 = NULL;
  pSrcBuffer1 = NULL;
  pDstBuffer = NULL;

  pVolData->unref();
  SoVolumeRendering::finish();
  SoDB::finish();
  return 0;
}

////////////////////////////////////////////////////////////////////////
//
// Stand-in for the real per-block computation
//
//   pSrcBuffer : input data              (ignored here)
//   pDstBuffer : output data             (ignored here)
//   actDim     : dimensions of data block
//   dataType   : type of data (int, float, etc)  (ignored here)
//
// This example is only about how the data gets loaded, so no values
// are actually read or written.  Instead the function sleeps for the
// time a real computation is estimated to take (number of values in
// the block divided by COMPUTE_PER_SEC), which lets us observe how
// well the asynchronous loading overlaps with the compute phase.
//
// Always returns 0.

int doComputation( void* /*pSrcBuffer*/, void* /*pDstBuffer*/,
                   SbVec3i32 actDim, SoVolumeData::DataType /*dataType*/ )
{
  // Size of the block, kept in single precision like the compute rate
  const float voxelCount = static_cast<float>( actDim[0] ) * actDim[1] * actDim[2];

  // Convert the estimated duration to whole milliseconds (truncating)
  const float estimatedSeconds = voxelCount / COMPUTE_PER_SEC;
  const int sleepMillis = static_cast<int>( estimatedSeconds * 1000 );

  // Pretend to be busy for that long
  SbTime::sleep( sleepMillis );
  return 0;
}

////////////////////////////////////////////////////////////////////////
//
// Reformat big numbers for display
//
//   _value  : number to format
//   _buffer : caller-owned character buffer that receives the text
//
// The value is divided by the largest of 1e9 / 1e6 / 1e3 that it
// reaches and printed as "<d.dd> U" where U is "G", "M", "K" or
// empty.  Values below 1e3 (including negative values) are printed
// unscaled.  No size check is done on _buffer.
//
// Returns _buffer.

char *
formatNumberApproximate( double _value, char *_buffer )
{
  struct UnitScale {
    double      threshold;
    const char *suffix;
  };
  // Ordered from largest to smallest so the first match wins
  const UnitScale scales[] = {
    { 1.e9, "G" },
    { 1.e6, "M" },
    { 1.e3, "K" }
  };

  double scaled = _value;
  const char *suffix = "";
  for (const UnitScale &scale : scales) {
    if (_value >= scale.threshold) {
      scaled = _value / scale.threshold;
      suffix = scale.suffix;
      break;
    }
  }

  sprintf( _buffer, "<%.2f> %s", scaled, suffix );
  return _buffer;
}


