diff --git a/bin/tadah_cli.cpp b/bin/tadah_cli.cpp
index deb213c28052e8c86359a6d5d52ac4ab1f9aac76..b1c2c76cb4240946963e0f5c5389a420b28420bd 100644
--- a/bin/tadah_cli.cpp
+++ b/bin/tadah_cli.cpp
@@ -32,6 +32,14 @@ extern "C" void descinit_(int*, int*, int*, int*, int*, int*, int*, int*, int*,
 extern "C" void pdpotrf_(char*, int*, double*, int*, int*, int*, int*);
 extern "C" void blacs_gridexit_(int*);
 extern "C" int numroc_(int*, int*, int*, int*, int*);
+
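+// PDGELS solves overdetermined or underdetermined distributed linear systems,
+// minimising ||A*X - B|| via a QR (or LQ) factorisation of the block-cyclic matrix A.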
+// (trans, m, n, nrhs, a, ia, ja, desc_a, b, ib, jb, desc_b, work, lwork, info)
+extern "C" void pdgels_(char* trans, int* m, int* n, int* nrhs, double* a, int* ia, int* ja, int* desca,
+    double* b, int* ib, int* jb, int* descb, double* work, int* lwork, int* info);
+//extern "C" void pdgels_(char*, int*, int*, int*,double*,int*,int*,int*,double*,int*,int*,int*,double*, int*, int*);
+
+
+
 #endif
 
 void TadahCLI::subcommand_train() {
@@ -133,7 +141,7 @@ void TadahCLI::subcommand_train() {
   DC_Selector DCS(config);
 
   // must set DSIZE key as early as possible,
-  // this will allow us to querry for the number of columns later on
+  // this will allow us to query for the number of columns later on
   DescriptorsCalc<> dc(config,*DCS.d2b,*DCS.d3b,*DCS.dmb,
       *DCS.c2b,*DCS.c3b,*DCS.cmb);
 
@@ -166,6 +174,9 @@ void TadahCLI::subcommand_train() {
   std::cout << "PHI ROWS: " << PHI_rows << std::endl;
   std::cout << "PHI COLS: " << PHI_cols << std::endl;
 
+  // the number of rows must be divisible by b_nrows
+  // the number of cols must be divisible by b_ncols
+
   // Initialize BLACS
   int b_rank;
   int b_row;
@@ -173,37 +184,42 @@ void TadahCLI::subcommand_train() {
   int context;
   int b_nrows;   // Number of row procs
   int b_ncols;   // Number of column procs
-  int rnb = 22;   // Row block size TODO
-  int cnb = 22;   // Column block size TODO
+  int rnb = 1;   // Row block size TODO
+  int cnb = 1;   // Column block size TODO
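+  // NOTE: block size 1 keeps the block-cyclic layout trivial while the row
+  // distribution is worked out; larger blocks (typically 32-64) are usually
+  // faster once that is in place.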
 
   blacs_pinfo_(&b_rank, &ncpu) ; // BLACS rank and world size
-  std::cout << "RANK: " << rank << " BRANK: " << b_rank << " ncpu: " << ncpu << std::endl;
+  //std::cout << "RANK: " << rank << " BRANK: " << b_rank << " ncpu: " << ncpu << std::endl;
 
   // for now assume that the CPU grid is tall and narrow TODO
   // the required function must take into consideration:
   // the size of the block rnb, number of cols and rows in Phi
   // ideally the grid should be as square as possible, unless PHI is narrow
   // Probably we want the number of column processes to be <= ~sqrt(ncpu)/cnb
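+  // A possible heuristic (untested sketch): take the largest factor of ncpu
+  // not exceeding sqrt(ncpu) as the number of column processes, e.g.
+  //   for (int q = (int)std::sqrt((double)ncpu); q >= 1; --q)
+  //     if (ncpu % q == 0) { b_ncols = q; b_nrows = ncpu / q; break; }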
   b_ncols = 1;
   b_nrows = ncpu;
   assert(b_nrows * b_ncols == ncpu);
 
-  std::cout << "b_ncols: " << b_ncols << " b_nrows: " << b_nrows << std::endl;
+  //std::cout << "b_ncols: " << b_ncols << " b_nrows: " << b_nrows << std::endl;
   int zero = 0;
   blacs_get_(&zero,&zero, &context ); // -> Create context
-  std::cout << "ZERO: " << zero << " CONTEXT: " << context << std::endl;
+  //std::cout << "ZERO: " << zero << " CONTEXT: " << context << std::endl;
   char layout='R'; // Block cyclic, Row major processor mapping
   blacs_gridinit_(&context, &layout, &b_nrows, &b_ncols ); // Context -> Initialize the grid
   blacs_gridinfo_(&context, &b_nrows, &b_ncols, &b_row, &b_col );
-  std::cout << "BRANK: " << b_rank << " BROW: " << b_row << " BCOL: " << b_col << std::endl;
+  //std::cout << "BRANK: " << b_rank << " BROW: " << b_row << " BCOL: " << b_col << std::endl;
+  printf( "rank = %d, nprow = %d, npcol = %d, myrow = %d, mycol = %d\n", rank, b_nrows, b_ncols, b_row, b_col );  
 
   // Compute the size of the local phi matrices
   int izero = 0;
+  int ione = 1;
   // int PHI_size = PHI_rows*PHI_cols;
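+  // numroc_ returns the number of rows (or columns) of the global matrix that
+  // this process owns under the block-cyclic distribution.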
   int phi_rows = numroc_( &PHI_rows, &rnb, &b_row, &izero, &b_nrows );
   int phi_cols = numroc_( &PHI_cols, &cnb, &b_col, &izero, &b_ncols );
   std::cout << "BRANK: " << b_rank << " phi_rows: " << phi_rows << " phi_col " << phi_cols << std::endl;
 
+  // COUNTERS
   size_t phi_row = 0; // next row to be filled in the local phi array
   int rows_available=phi_rows;  // number of available rows in the local phi array
 
@@ -212,11 +228,6 @@ void TadahCLI::subcommand_train() {
   // workers
   DesignMatrix<DM_Function_Base&> dm(*fb, config);
   dm.Phi.resize(phi_rows,phi_cols);
-  //Matrix phi;
-  //if (rank!=0) {
-  //  // allocate matrix for each worker
-  //  phi = Matrix(phi_rows,phi_cols);
-  //}
 
   // BEGIN HOST-WORKER
   if (rank==0) {
@@ -251,13 +262,11 @@ void TadahCLI::subcommand_train() {
       }
 
       // receive ANY request from ANY worker
-
       MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
       int worker = status.MPI_SOURCE;
       int tag = status.MPI_TAG;
       std::cout << "HOST1 request from WORKER: " << worker << " TAG: " << tag << std::endl; 
 
-
       if (tag==WORK_TAG) {
         int wsize; // array size for DATA_TAG, rows_available for WORK_TAG
         MPI_Recv (&wsize, 1, MPI_INT, worker, tag, MPI_COMM_WORLD, &status);
@@ -280,7 +289,6 @@ void TadahCLI::subcommand_train() {
         MPI_Send (&nstruc, 1, MPI_INT, worker, tag, MPI_COMM_WORLD);
 
         std::cout << "HOST: " << fn << " " << first << " " << nstruc << std::endl;
-        //}
     }
     else if (tag==DATA_TAG) {
       int rows_needed;
@@ -290,7 +298,6 @@ void TadahCLI::subcommand_train() {
         int rows_accepted = rows_available < rows_needed ? rows_available : rows_needed;
         MPI_Send (&b_rank, 1, MPI_INT, worker, tag, MPI_COMM_WORLD);
         MPI_Send (&rows_accepted, 1, MPI_INT, worker, tag, MPI_COMM_WORLD);
-        //MPI_Recv (&rows_accepted, 1, MPI_INT, worker, tag, MPI_COMM_WORLD, &status);
 
         // get data...
         MPI_Probe(worker, tag, MPI_COMM_WORLD, &status);
@@ -299,10 +306,8 @@ void TadahCLI::subcommand_train() {
         MPI_Get_count(&status, MPI_DOUBLE, &arr_size2);
         if (arr_size2!=arr_size) { throw std::runtime_error("arr_size != arr_size2\n");}
         MPI_Recv (&dm.Phi.data()[phi_row], arr_size, MPI_DOUBLE, worker, tag, MPI_COMM_WORLD, &status);
-
         rows_available -= rows_accepted;
         phi_row += rows_accepted;
-        //int start=phi_row*phi_cols;
         if (rows_available==0 ) { std::cout << "!!! HOST1 LOCAL MATRIX IS FILLED !!!" << std::endl;}
         if (rows_available<0 ) { throw std::runtime_error(" HOST1: The number of rows in the local array is smaller than requested.");}
 
@@ -547,11 +552,11 @@ else { // WORKER
           // we send data to the host or a willing worker
           //MPI_Send (&rows_accepted, 1, MPI_INT, dest, DATA_TAG, MPI_COMM_WORLD);
           //std::cout << "  FULL WORKER: " << b_rank << " send rows_accepted: " << rows_accepted << std::endl; 
-          rows_needed -= rows_accepted;
           //std::cout << "  FULL WORKER: " << b_rank << " rows_needed: " << rows_needed << std::endl; 
           int start=(temp_dm.Phi.rows()-rows_needed); // Phi is stored in column major order
           int arr_size = rows_accepted*phi_cols;
           MPI_Send (&temp_dm.Phi.data()[start], arr_size, MPI_DOUBLE, dest, DATA_TAG, MPI_COMM_WORLD);
+          rows_needed -= rows_accepted;
         }
 
       }
@@ -570,10 +575,71 @@ else { // WORKER
   }
   if (rows_available==0) std::cout << "+++++++ WORKER IS FULL++++++++  " << b_rank << "  ROWS AVAILABLE: " << rows_available << std::endl;
 
-  // All phi matrices are computed by this point
 }
 std::cout << "RANK" << rank << " HOST-WORKER EXIT SUCCESS" << std::endl;
 // END HOST-WORKER
+
+// All phi matrices are computed by this point
+
+// Array descriptors for ScaLAPACK
+int descPHI[9];
+int descB[9];
+int info;
+int info2;
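+// descinit_ fills the 9-element array descriptor: dtype, BLACS context, global
+// rows/cols, row/col block sizes, first process row/col, and local leading dimension.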
+descinit_( descPHI,  &PHI_rows, &PHI_cols, &rnb, &cnb, &izero, &izero, &context, /*leading dimension*/&phi_rows, &info);
+descinit_( descB  ,  &PHI_rows, &ione, &rnb, &cnb, &izero, &izero, &context, /*leading dimension*/&phi_rows, &info2);
+
+if (rank==1) {
+  std::cout << "descPHI:"<< std::endl;
+  for (int i=0;i<9;++i) {
+    std::cout << descPHI[i] << " ";
+  }
+  std::cout << std::endl;
+  std::cout << "descB:"<< std::endl;
+  for (int i=0;i<9;++i) {
+    std::cout << descB[i] << " ";
+  }
+  std::cout << std::endl;
+}
+if(info != 0) {
+  printf("Error in descinit (descPHI), info = %d\n", info);
+}
+if(info2 != 0) {
+  printf("Error in descinit (descB), info = %d\n", info2);
+}
+
+// Run pdgels (distributed least-squares solve) and time it
+double MPIt1 = MPI_Wtime();
+char trans='N';
+int nrhs = 1;
+double *b = new double[phi_rows];
+for (int i=0;i<phi_rows;++i) b[i]=i;
+//if (rank==0)
+//  std::cout << dm.Phi;
+int ia = 1;
+int ja = 1;
+int ib = 1;
+int jb = 1;
+
+double wkopt;
+int lwork = -1; // query -> get size of the work matrix
+pdgels_(&trans, &phi_rows, &phi_cols, &nrhs, dm.Phi.ptr(), &ia, &ja, descPHI, b, &ib, &jb, descB, &wkopt, &lwork, &info);
+if (info != 0) {
+  printf("Error in potrf, info = %d\n", info);
+}
+
+lwork = (int)wkopt;
+
+std::cout << "work array size: "  << lwork << std::endl;
+
+double *work = new double[lwork];
+pdgels_(&trans, &phi_rows, &phi_cols, &nrhs, dm.Phi.ptr(), &ia, &ja, descPHI, b, &ib, &jb, descB, work, &lwork, &info);
+
+double MPIt2 = MPI_Wtime();
+printf("[%dx%d] Done, time %e s.\n", b_row, b_col, MPIt2 - MPIt1);
+
+delete[] b;
+delete[] work;
 #endif