From 0644cb627cd41c58b37a634f0de9f262161aa8bc Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 14:05:45 -0700
Subject: [PATCH 01/57] + Fixed bug in which an unknown file type is never
 detected

---
 src/tensor_read.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/tensor_read.cc b/src/tensor_read.cc
index c7908bb..376b37c 100644
--- a/src/tensor_read.cc
+++ b/src/tensor_read.cc
@@ -339,6 +339,8 @@ detect_file_format(FILE *file)
   
   debug("detect_file_format(0x%x)\n", file);
   
+  format = file_format::unknown;
+  
   if (EOF != (c = peek(file))) {
     if ('%' == c) {
       format = file_format::mmio;

From 868e7e90a717f67b5410d6ea7dda142944a3ed84 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 14:06:51 -0700
Subject: [PATCH 02/57] + Added global thread count variable

---
 src/main.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/main.cc b/src/main.cc
index e74133a..1f10169 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -22,6 +22,7 @@ uint              cache_size;
 uint              cache_line_size;
 uint              iterations;
 uint              seed;
+uint              threads;
 char              *tool_name;
 tool::type_t      tool_type;
 bool              tracing;

From 0a7e82a815107aa34bcc9f20422f13cc12919289 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 14:07:55 -0700
Subject: [PATCH 03/57] + Changed matrix to be stored as a contiguous vector

---
 src/matrix_free.cc   | 6 +-----
 src/matrix_malloc.cc | 5 ++++-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/matrix_free.cc b/src/matrix_free.cc
index dff8863..f3ec950 100644
--- a/src/matrix_free.cc
+++ b/src/matrix_free.cc
@@ -11,8 +11,6 @@
 void
 matrix_free(matrix_t *matrix)
 {
-  uint i;
-  
   superfluous("matrix_free(matrix=0x%x)\n", matrix);
   
   if (!matrix) {
@@ -20,9 +18,7 @@ matrix_free(matrix_t *matrix)
   }
   
   if (ownership::creator == matrix->owner) {
-    for (i = 0; i < matrix->m; ++i) {
-      safe_free(matrix->data[i]);
-    }    
+    safe_free(matrix->data[0]);
     safe_free(matrix->data);
   }
   
diff --git a/src/matrix_malloc.cc b/src/matrix_malloc.cc
index c541477..f4e45b6 100644
--- a/src/matrix_malloc.cc
+++ b/src/matrix_malloc.cc
@@ -44,6 +44,7 @@ matrix_malloc(uint m, uint n, ownership::type_t owner)
 {
   uint     i;
   matrix_t *mr;
+  double   *p;
   
   superfluous("matrix_malloc(m=%d, n=%d, owner='%s')\n", m, n, ownership_to_string(owner));
   
@@ -59,8 +60,10 @@ matrix_malloc(uint m, uint n, ownership::type_t owner)
   }
   
   mr->data = MALLOC_N(double*, m);
+  p        = MALLOC_N(double, m*n);
   for (i = 0; i < m; ++i) {
-    mr->data[i] = MALLOC_N(double, n);
+    mr->data[i] = p;
+    p += n;
   }
   
   return mr;

From e0c37d237dcad6a64fa03c919f17d5a4c76ab65a Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 14:08:42 -0700
Subject: [PATCH 04/57] + Added command line support for thread count to use
 during execution

---
 src/tool.h             |  1 +
 src/tool_effectuate.cc | 12 ++++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/tool.h b/src/tool.h
index 01a4fd4..851792d 100644
--- a/src/tool.h
+++ b/src/tool.h
@@ -30,6 +30,7 @@ namespace tool {
 #define DEFAULT_SIMULATE              false
 #define DEFAULT_STRATEGY              strategy::compressed
 #define DEFAULT_TRACING               false
+#define DEFAULT_THREAD_COUNT          1
 #define DEFAULT_VERBOSE               false
 #define DEFAULT_VERBOSITY             verbosity::low
 #define DEFAULT_WRITE_RESULTS         false
diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index 68e0714..fc339f3 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -23,6 +23,7 @@ extern uint              cache_size;
 extern uint              cache_line_size;
 extern uint              iterations;
 extern bool              human_readable;
+extern uint              threads;
 extern char              *tool_name;
 extern tool::type_t      tool_type;
 extern bool              simulate;
@@ -51,7 +52,8 @@ effectuate_tool_usage()
 #if !defined (NOSIMULATE)
   message("\t-s\tsimulate cache (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_SIMULATE));
 #endif
-  message("\t-t\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING));
+  message("\t-t\tnumer of threads to use (default: %d)\n", DEFAULT_THREAD_COUNT);
+  message("\t-T\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING));
   message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE));
   message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max);
   message("\t-w\twrite results (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_WRITE_RESULTS));
@@ -183,7 +185,7 @@ effectuate_tool_main(int argc, char *argv[])
   opterr = 0;
   
   /* extract any command-line options the user provided */
-  while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:p:stuvV:w"))) {
+  while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:p:st:TuvV:w"))) {
     switch (c) {
     case 'h': 
       effectuate_tool_usage();
@@ -217,6 +219,12 @@ effectuate_tool_main(int argc, char *argv[])
       simulate = !simulate;
       break;
     case 't':
+      threads = atoi(optarg);
+      if (0 == threads) {
+	threads = DEFAULT_THREAD_COUNT;
+      }
+      break;
+    case 'T':
       tracing = !tracing;
       break;
     case 'u':

From f03f7c3c6ebfc0248708e9faeb3c9926e23bd55f Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 14:20:44 -0700
Subject: [PATCH 05/57] + Removed code that drops zero entries

---
 src/tensor_write.cc | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/tensor_write.cc b/src/tensor_write.cc
index 42fc5a2..8b86c97 100644
--- a/src/tensor_write.cc
+++ b/src/tensor_write.cc
@@ -36,21 +36,6 @@ tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor)
     die("Could not write Tensor Market banner (%d).\n", result);
   }
   
-#if 0
-  storage = STORAGE_COORIDINATE(tensor);
-  tuples  = storage->tuples;
-  nnz     = 0;
-  
-  for (i = 0; i < tensor->nnz; ++i) {
-    if (!might_as_well_be_zero(tensor->values[i])) {
-      nnz++;
-    }
-  }
-  
-  debug("tensor_write_coordinate: non-zero values: implied=%d, actual=%d.\n", tensor->nnz, nnz);
-  debug("tensor_write_coordinate: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n);
-#endif
-  
   debug("tensor_write_coordinate: non-zero values: actual=%d.\n", tensor->nnz);
   debug("tensor_write_coordinate: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n);
   

From 5f7961604bd3fd937d006aa4bcd7a8dd322eb657 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 15:21:41 -0700
Subject: [PATCH 06/57] + Since we support array tensors, we need to enable
 support in the lower level functions that do sanity checks for us

---
 src/compatible.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/compatible.cc b/src/compatible.cc
index 1f9d1ac..1a14ecb 100644
--- a/src/compatible.cc
+++ b/src/compatible.cc
@@ -11,6 +11,7 @@ compatible(vector_t const *lhs, tensor_t const *rhs)
   debug("compatible(vector=0x%x, tensor=0x%x)\n", lhs, rhs);
   
   switch (rhs->strategy) {
+  case strategy::array:
   case strategy::compressed:
   case strategy::slice:
   case strategy::ekmr:
@@ -23,7 +24,7 @@ compatible(vector_t const *lhs, tensor_t const *rhs)
   }
   
   if (!supported) {
-    die("Tensor strategy '%s' is not currently supported.\n",
+    die("compatible: tensor strategy '%s' is not currently supported.\n",
 	strategy_to_string(rhs->strategy));
   }
   

From 39ee85e40e8b6eacf6457672aa680c4d560f4d3b Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 15:25:13 -0700
Subject: [PATCH 07/57] + All tensor/vector combinations work, so long as the
 tensor's tube is the correct length

---
 src/compatible.cc | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/src/compatible.cc b/src/compatible.cc
index 1a14ecb..2f7ddfe 100644
--- a/src/compatible.cc
+++ b/src/compatible.cc
@@ -28,24 +28,12 @@ compatible(vector_t const *lhs, tensor_t const *rhs)
 	strategy_to_string(rhs->strategy));
   }
   
-  switch (rhs->orientation) {
-  case orientation::row:
-  case orientation::column:
-  case orientation::tube:
-  case orientation::lateral:
-  case orientation::horizontal:
-  case orientation::frontal:
-    compatible = (lhs->n == rhs->l);
-    break;
-  default:
-    compatible = false;
-    break;
-  }
+  compatible = (lhs->n == rhs->l);
   
   if (!compatible) {
     print_information(lhs);
     print_information(rhs);
-    die("Tensors and vector do not have matching dimensions.\n");
+    die("Tensor and vector do not have matching dimensions.\n");
   }
 }
 

From 736e3fbc4c681ba4b3af27eb534f3780025a2580 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 17:56:18 -0700
Subject: [PATCH 08/57] + Added threading code to makefile

---
 src/Makefile | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 675cf1f..6265660 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -16,7 +16,7 @@ ifndef SIMULATE
 	EXTRA_DEBUG += -DNOSIMULATE
 endif
 EXTRA_CXXFLAGS=-c $(EXTRA_DEBUG) $(STRICT) $(INCLUDES) $(CPPX11)
-EXTRA_LDFLAGS=-Wall $(EXTRA_DEBUG)
+EXTRA_LDFLAGS=-Wall -lpthread $(EXTRA_DEBUG)
 
 HEADERS_CACHE=address.h cache.h hash.h
 HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h	\
@@ -32,10 +32,11 @@ HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE)	\
 SOURCES_CACHE=address.cc cache.cc hash.cc
 SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc		\
 	information.cc latex.cc memory.cc mmio.cc			\
-	operation_n_mode_product.cc operation_utility.cc random.cc	\
-	strings.cc timer.cc tool_convert.cc tool_effectuate.cc		\
-	tool_generate.cc tool_permute.cc tool_timing.cc			\
-	tool_utility.cc types.cc utility.cc
+	operation_n_mode_product.cc					\
+	operation_threaded_n_mode_product.cc operation_utility.cc	\
+	random.cc strings.cc timer.cc tool_convert.cc			\
+	tool_effectuate.cc tool_generate.cc tool_permute.cc		\
+	tool_timing.cc tool_utility.cc types.cc utility.cc
 SOURCES_GENERATE=generate_tensor_from_matrix.cc
 SOURCES_MATRIX=matrix_arithmetic.cc matrix_clear.cc			\
 	matrix_compatible.cc matrix_copy.cc matrix_free.cc		\

From 8618749008a73fb536ad4ef9e0c0c8a99c78e374 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 17:58:47 -0700
Subject: [PATCH 09/57] + Added threading support for n-mode product + Global
 threads => thread_count

---
 src/main.cc                     |  2 +-
 src/operation_n_mode_product.cc | 24 ++++++++++++++++++++----
 src/tool_effectuate.cc          | 10 +++++-----
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/src/main.cc b/src/main.cc
index 1f10169..2b893fa 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -22,7 +22,7 @@ uint              cache_size;
 uint              cache_line_size;
 uint              iterations;
 uint              seed;
-uint              threads;
+uint              thread_count;
 char              *tool_name;
 tool::type_t      tool_type;
 bool              tracing;
diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index 8d60446..ac0ee24 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -10,6 +10,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+extern cache_t *cache;
+extern uint    thread_count;
+
 /*
   Computing ($pT$):
   Let $\T \in R^{n\times n\times n}$ be a tensor.
@@ -24,8 +27,6 @@
   end for
 */
 
-extern cache_t *cache;
-
 void
 compressed_row(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
@@ -412,9 +413,9 @@ n_mode_product_ekmr(matrix_t *matrix, vector_t const *vector, tensor_t const *te
 }
 
 void
-operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
-  debug("operation_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  debug("serial_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
   
   compatible(vector, tensor);
   
@@ -438,6 +439,21 @@ operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t cons
   }
 }
 
+extern void 
+threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor);
+
+void
+operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+{
+  debug("operation_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  
+  if (1 == thread_count) {
+    serial_n_mode_product(matrix, vector, tensor);
+  } else {
+    threaded_n_mode_product(matrix, vector, tensor);
+  }
+}
+
 matrix_t*
 operation_n_mode_product(vector_t const *vector, tensor_t const *tensor)
 {
diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index fc339f3..b16ce94 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -23,7 +23,7 @@ extern uint              cache_size;
 extern uint              cache_line_size;
 extern uint              iterations;
 extern bool              human_readable;
-extern uint              threads;
+extern uint              thread_count;
 extern char              *tool_name;
 extern tool::type_t      tool_type;
 extern bool              simulate;
@@ -52,7 +52,7 @@ effectuate_tool_usage()
 #if !defined (NOSIMULATE)
   message("\t-s\tsimulate cache (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_SIMULATE));
 #endif
-  message("\t-t\tnumer of threads to use (default: %d)\n", DEFAULT_THREAD_COUNT);
+  message("\t-t\tnumer of thread_count to use (default: %d)\n", DEFAULT_THREAD_COUNT);
   message("\t-T\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING));
   message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE));
   message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max);
@@ -219,9 +219,9 @@ effectuate_tool_main(int argc, char *argv[])
       simulate = !simulate;
       break;
     case 't':
-      threads = atoi(optarg);
-      if (0 == threads) {
-	threads = DEFAULT_THREAD_COUNT;
+      thread_count = atoi(optarg);
+      if (0 == thread_count) {
+	thread_count = DEFAULT_THREAD_COUNT;
       }
       break;
     case 'T':

From 2f90d1ceeda297f29fe2e48d197972efdf50c52e Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 17:59:52 -0700
Subject: [PATCH 10/57] + Array tensors now read and write array formats rather
 than coordinate formats

---
 src/tensor_read.cc  | 38 +++++++++++++++-----------------------
 src/tensor_write.cc | 43 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/src/tensor_read.cc b/src/tensor_read.cc
index 376b37c..0fcb404 100644
--- a/src/tensor_read.cc
+++ b/src/tensor_read.cc
@@ -10,13 +10,11 @@
 tensor_t*
 tensor_fread_array(FILE *file)
 {
-  int                  i, j, k, v;
-  int                  l, m, n, nnz;
-  int                  result;
-  double               d;
+  int                  i, j, k, index;
+  int                  l, m, n;
+  int                  line, result;
   tensor_t             *tensor;
-  tensor_storage_coordinate_t *storage;
-  coordinate_tuple_t   *tuples;
+  double               *T;
   
   debug("tensor_fread_array(0x%x)\n", file);
   
@@ -26,24 +24,18 @@ tensor_fread_array(FILE *file)
   
   debug("tensor_fread_array: l=%d, m=%d, n=%d\n", l, m, n);
   
-  nnz     = l*m*n;
-  tensor  = tensor_malloc(l, m, n, nnz, strategy::coordinate);
-  storage = STORAGE_COORIDINATE(tensor);
-  tuples  = storage->tuples;
-  v       = 0;
-  
-  for (k = 0; k < l; ++k) {
-    for (i = 0; i < m; ++i) {
-      for (j = 0; j < n; ++j) {
-	if (1 != (result = fscanf(file, "%lg\n", &d))) {
-	  die("Failed to process line %d of the input stream (%d).\n", v, result);
+  tensor  = tensor_malloc(l, m, n);
+  T       = tensor->values;
+  line    = 0;
+    
+  for (i = 0; i < m; ++i) {
+    for (j = 0; j < n; ++j) {
+      for (k = 0; k < l; ++k) {
+	index = tensor_index(tensor, i, j, k);
+	if (1 != (result = fscanf(file, "%lg\n", &T[index]))) {
+	  die("Failed to process line %d of the input stream (%d).\n", line, result);
 	}
-	tensor->values[v] = d;
-	tuples[v].i       = i;
-	tuples[v].j       = j;
-	tuples[v].k       = k;
-	tuples[v].index   = v;
-	v++;
+	line++;
       }
     }
   }
diff --git a/src/tensor_write.cc b/src/tensor_write.cc
index 8b86c97..8462f7c 100644
--- a/src/tensor_write.cc
+++ b/src/tensor_write.cc
@@ -19,6 +19,38 @@ tensor_initialize_typecode(MM_typecode *type, strategy::type_t strategy)
   mm_set_real(type);
 }
 
+void
+tensor_fwrite_array(FILE *file, tensor_t const *tensor)
+{
+  int         i, j, k;
+  int         l, m, n;
+  int         result;
+  MM_typecode type;
+  double      ***T;
+  
+  debug("tensor_write_array(file=0x%x, tensor=0x%x)\n", file, tensor);
+  
+  tensor_initialize_typecode(&type, strategy::array);
+  
+  if (0 != (result = mm_write_banner(file, type))) {
+    die("Could not write Tensor Market banner (%d).\n", result);
+  }
+  
+  debug("tensor_write_array: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n);
+  
+  if (0 != (result = mm_write_tensor_array_size(file, tensor->l, tensor->m, tensor->n))) {
+    die("Failed to write array tensor of size %d x %d x %d (%d).\n", tensor->l, tensor->m, tensor->n, result);
+  }
+  
+  for (i = 0; i < m; ++i) {
+    for (j = 0; j < n; ++j) {
+      for (k = 0; k < l; ++k) {
+	fprintf(file, "%10.6g\n", T[i][j][k]);
+      }
+    }
+  }
+}
+
 void
 tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor)
 {
@@ -47,7 +79,7 @@ tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor)
   tuples  = storage->tuples;
   
   for (i = 0; i < tensor->nnz; ++i) {
-    fprintf(file, "%d %d %d %10.32g\n", tuples[i].k, tuples[i].i, tuples[i].j, tensor->values[tuples[i].index]);
+    fprintf(file, "%d %d %d %10.6g\n", tuples[i].k, tuples[i].i, tuples[i].j, tensor->values[tuples[i].index]);
   }
 }
 
@@ -95,7 +127,7 @@ tensor_fwrite_compressed(FILE *file, tensor_t const *tensor)
   }
   
   for (i = 0; i < nnz; ++i) {
-    fprintf(file, "%d %d %10.32g\n", storage->CO[i], storage->KO[i], tensor->values[i]);
+    fprintf(file, "%d %d %10.6g\n", storage->CO[i], storage->KO[i], tensor->values[i]);
   }
 }
 
@@ -145,7 +177,7 @@ tensor_fwrite_compressed_slice(FILE *file, tensor_t const *tensor)
   }
   
   for (i = 0; i < nnz; ++i) {
-    fprintf(file, "%d %10.32g\n", storage->KO[i], tensor->values[i]);
+    fprintf(file, "%d %10.6g\n", storage->KO[i], tensor->values[i]);
   }
 }
 
@@ -195,7 +227,7 @@ tensor_fwrite_extended_compressed(FILE *file, tensor_t const *tensor)
   }
   
   for (i = 0; i < nnz; ++i) {
-    fprintf(file, "%d %10.32g\n", storage->CK[i], tensor->values[i]);
+    fprintf(file, "%d %10.6g\n", storage->CK[i], tensor->values[i]);
   }
 }
 
@@ -206,6 +238,9 @@ tensor_fwrite_implementation(FILE *file, tensor_t const *tensor)
   debug("tensor_fwrite_implementation: strategy='%s'\n", strategy_to_string(tensor->strategy));
   
   switch (tensor->strategy) {
+  case strategy::array:
+    tensor_fwrite_array(file, tensor);
+    break;
   case strategy::coordinate:
     tensor_fwrite_coordinate(file, tensor);
     break;

From cc67ca542f18112b7a9bd37a544d5366edadbf85 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 18:01:00 -0700
Subject: [PATCH 11/57] + Added a simple inline function to calculate offsets
 into a tensor (when stored as a vector)

---
 src/tensor.h         | 1 +
 src/tensor_malloc.cc | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/tensor.h b/src/tensor.h
index d3c14ca..378eb54 100644
--- a/src/tensor.h
+++ b/src/tensor.h
@@ -112,6 +112,7 @@ tensor_t* tensor_malloc(uint l, uint m, uint n, uint nnz, strategy::type_t strat
 			orientation::type_t orientation = orientation::unknown,
 			ownership::type_t owner = ownership::creator);
 tensor_t* tensor_malloc_from_template(tensor_t const *tensor);
+uint tensor_index(tensor_t const *tensor, uint i, uint j, uint k);
 void tensor_free(tensor_t *tensor);
 
 tensor_t* tensor_copy_shallow(tensor_t *source);
diff --git a/src/tensor_malloc.cc b/src/tensor_malloc.cc
index f55f93d..ddce7ce 100644
--- a/src/tensor_malloc.cc
+++ b/src/tensor_malloc.cc
@@ -8,6 +8,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+uint
+tensor_index(tensor_t const *tensor, uint i, uint j, uint k) {
+  return (i*tensor->n*tensor->m) + (j*tensor->m) + k;
+}
+
 tensor_t*
 tensor_malloc(uint l, uint m, uint n, ownership::type_t owner)
 {
@@ -30,8 +35,8 @@ tensor_malloc(uint l, uint m, uint n, ownership::type_t owner)
     return tensor;
   }
   
-  tensor->values     = MALLOC_N(double, l*m*n);
-  
+  tensor->values      = MALLOC_N(double, l*m*n);
+    
   superfluous("tensor_malloc: tensor->values=0x%x\n", tensor->values);
   superfluous("tensor_malloc: tensor=0x%x\n", tensor);
 

From 134006df19194caf61395fa0201b0ed449c7ca1c Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 18:02:57 -0700
Subject: [PATCH 12/57] + Serial and threaded n-mode product calculation for
 dense array tensors

---
 src/operation_threaded_n_mode_product.cc | 157 +++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 src/operation_threaded_n_mode_product.cc

diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc
new file mode 100644
index 0000000..2ebe875
--- /dev/null
+++ b/src/operation_threaded_n_mode_product.cc
@@ -0,0 +1,157 @@
+
+#include "cache.h"
+#include "compatible.h"
+#include "error.h"
+#include "matrix.h"
+#include "operation.h"
+#include "tensor.h"
+#include "utility.h"
+#include "vector.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+extern cache_t *cache;
+extern uint    thread_count;
+
+/*
+  Computing ($pT$):
+  Let $\T \in R^{n\times n\times n}$ be a tensor.
+  Let $\M \in R^{n\times n}$ be a matrix.
+  Let $p \in R^{n}$ be a vector.
+  for i = 1 to l do
+    for j = 1 to m do 
+      for k = 1 to m do
+        M[i][j] += p[k] * T[i][j][k]
+      end for
+    end for
+  end for
+*/
+
+typedef struct {
+  uint           done;
+  matrix_t       *matrix;
+  vector_t const *vector;
+  tensor_t const *tensor;
+} product_thread_data_t;
+
+static pthread_mutex_t tube_lock;
+
+int
+next_tube(product_thread_data_t *data)
+{
+  uint k;
+  
+  pthread_mutex_lock(&tube_lock);
+  k = data->done++;
+  pthread_mutex_unlock(&tube_lock);
+  return k < (data->tensor->m*data->tensor->n) ? k : -1;
+}
+
+void*
+fiber_product(void *arg)
+{
+  int                   k;
+  product_thread_data_t *data;
+  uint   i, j, k, index;
+  uint   m, n, l;
+  uint   *P;
+  double **M, *T;
+  
+  data = (product_thread_data_t*) arg;
+  
+  M = matrix->data;
+  P = vector->data;
+  T = tensor->values;
+  
+  l = tensor->l;
+  m = tensor->m;
+  n = tensor->n;
+  
+  while (-1 != (k = next_tube(data))) {
+    offset = k*data->tensor->l;
+    for (i = 0; i < l; ++i) {
+      T[offset+i];
+      // M[i][j] += P[k] * T[index];
+    }
+  }
+  
+  return NULL;
+}
+
+void
+threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+{
+  uint                  i;
+  pthread_t             threads[32];
+  int                   error;
+  int                   *status;
+  product_thread_data_t data;
+  
+  data.done   = 0;
+  data.matrix = matrix;
+  data.vector = vector;
+  data.tensor = tensor;
+  
+  pthread_mutex_init(&tube_lock, NULL);
+
+  for (i = 0; i < thread_count; ++i) {
+    if (0 != (error = pthread_create(&threads[i], NULL, fiber_product, &data))) {
+      die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error);
+    }
+  }
+  
+  for (i = 0; i < thread_count; ++i) {
+    if (0 != (error = pthread_join(threads[i], NULL))) {
+      die("threaded_n_mode_product_array: pthread_join() failed with %d\n", error);
+    }
+  }
+  
+  pthread_mutex_destroy(&tube_lock);
+}
+
+void
+n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+{
+  uint   i, j, k, index;
+  uint   m, n, l;
+  uint   *P;
+  double **M, *T;
+  
+  debug("n_mode_product_array(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  
+  M = matrix->data;
+  P = vector->data;
+  T = tensor->values;
+  
+  l = tensor->l;
+  m = tensor->m;
+  n = tensor->n;
+  
+  for (i = 0; i < m; ++i) {
+    for (j = 0; j < n; ++j) {
+      for (k = 0; k < l; ++k) {
+	index = tensor_index(tensor, i, j, k);
+	M[i][j] += P[k] * T[index];
+      }
+    }
+  }
+}
+
+void
+threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+{
+  debug("threaded_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  
+  compatible(vector, tensor);
+  
+  switch (tensor->strategy) {
+  case strategy::array:
+    n_mode_product_array(matrix, vector, tensor);
+    break;
+  default:
+    die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n",
+	strategy_to_string(tensor->strategy));
+    break;
+  }
+}

From 01cbeb405f9157d515443698f5d9da2b336f4000 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 18:17:16 -0700
Subject: [PATCH 13/57] + Sample dense data

---
 results/dense3.in | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 results/dense3.in

diff --git a/results/dense3.in b/results/dense3.in
new file mode 100644
index 0000000..23bcbd8
--- /dev/null
+++ b/results/dense3.in
@@ -0,0 +1,29 @@
+%%MatrixMarket tensor array      real    general            
+3 3 3
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27

From 3fcb1d2b743ddf9c17114b5418a38ca12a22e804 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 18:39:44 -0700
Subject: [PATCH 14/57] + Fixed typo in matrix file format enum: there was no
 'unknown' entry

---
 src/matrix.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/matrix.h b/src/matrix.h
index a8b32a8..e4a9693 100644
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -7,7 +7,7 @@
 
 namespace format {
   typedef enum {
-    format,
+    unknown,
     array,
     coordinate
   } type_t;

From e54654c40c4e3fc363187accb5c9e4ac1c212262 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 18:40:38 -0700
Subject: [PATCH 15/57] + Added debugging output for matrix writing code +
 Decreased the number of decimal places printed to a file

---
 src/matrix_write.cc | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/matrix_write.cc b/src/matrix_write.cc
index 4f2b3cd..d8cc7f2 100644
--- a/src/matrix_write.cc
+++ b/src/matrix_write.cc
@@ -23,6 +23,8 @@ matrix_fwrite_array(FILE *file, matrix_t const *matrix)
   int         result;
   MM_typecode type;
   
+  debug("matrix_fwrite_array(file=0x%x, matrix=0x%x)\n", file, matrix);
+  
   matrix_initialize_type(&type);
   mm_set_array(&type);
   
@@ -33,10 +35,10 @@ matrix_fwrite_array(FILE *file, matrix_t const *matrix)
   if (0 != (result = mm_write_matrix_array_size(file, matrix->m, matrix->n))) {
     die("Failed to write matrix array size (%d).\n", result);
   }
-
+  
   for (i = 0; i < matrix->m; ++i) {
     for (j = 0; j < matrix->n; ++j) {
-      fprintf(file, "%10.32g\n", matrix->data[i][j]);
+      fprintf(file, "%10.6g\n", matrix->data[i][j]);
     }
   }
 }
@@ -48,6 +50,8 @@ matrix_fwrite_coordinate(FILE *file, matrix_t const *matrix)
   int         nnz, result;
   MM_typecode type;
   
+  debug("matrix_fwrite_coordinate(file=0x%x, matrix=0x%x)\n", file, matrix);
+  
   matrix_initialize_type(&type);
   mm_set_coordinate(&type);
   
@@ -71,7 +75,7 @@ matrix_fwrite_coordinate(FILE *file, matrix_t const *matrix)
   for (i = 0; i < matrix->m; ++i) {
     for (j = 0; j < matrix->n; ++j) {
       if (!might_as_well_be_zero(matrix->data[i][j])) {
-	fprintf(file, "%d %d %10.32g\n", i+1, j+1, matrix->data[i][j]);
+	fprintf(file, "%d %d %10.6g\n", i+1, j+1, matrix->data[i][j]);
       }
     }
   }
@@ -80,6 +84,8 @@ matrix_fwrite_coordinate(FILE *file, matrix_t const *matrix)
 void
 matrix_fwrite(FILE *file, matrix_t const *matrix, format::type_t format)
 {
+  debug("matrix_fwrite(file=0x%x, matrix=0x%x)\n", file, matrix);
+  
   if (format::coordinate == format) {
     matrix_fwrite_coordinate(file, matrix);
   } else {
@@ -91,7 +97,9 @@ void
 matrix_write(char const *filename, matrix_t const *matrix, format::type_t format)
 {
   FILE *file;
-
+  
+  debug("matrix_write(0x%x)\n", file);
+  
   file = fopen_or_die(filename, "w+");
   matrix_fwrite(file, matrix, format);
   fclose(file);

From c16a4cf340b5b565976330f4a363968ab8d2a970 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 18:41:28 -0700
Subject: [PATCH 16/57] + Fixed typos in debug output statements

---
 src/tensor_write.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/tensor_write.cc b/src/tensor_write.cc
index 8462f7c..8f69149 100644
--- a/src/tensor_write.cc
+++ b/src/tensor_write.cc
@@ -28,7 +28,7 @@ tensor_fwrite_array(FILE *file, tensor_t const *tensor)
   MM_typecode type;
   double      ***T;
   
-  debug("tensor_write_array(file=0x%x, tensor=0x%x)\n", file, tensor);
+  debug("tensor_fwrite_array(file=0x%x, tensor=0x%x)\n", file, tensor);
   
   tensor_initialize_typecode(&type, strategy::array);
   
@@ -36,7 +36,7 @@ tensor_fwrite_array(FILE *file, tensor_t const *tensor)
     die("Could not write Tensor Market banner (%d).\n", result);
   }
   
-  debug("tensor_write_array: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n);
+  debug("tensor_fwrite_array: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n);
   
   if (0 != (result = mm_write_tensor_array_size(file, tensor->l, tensor->m, tensor->n))) {
     die("Failed to write array tensor of size %d x %d x %d (%d).\n", tensor->l, tensor->m, tensor->n, result);
@@ -60,7 +60,7 @@ tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor)
   tensor_storage_coordinate_t *storage;
   coordinate_tuple_t          *tuples;
   
-  debug("tensor_write_coordinate(file=0x%x, tensor=0x%x)\n", file, tensor);
+  debug("tensor_fwrite_coordinate(file=0x%x, tensor=0x%x)\n", file, tensor);
   
   tensor_initialize_typecode(&type, strategy::coordinate);
   
@@ -68,8 +68,8 @@ tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor)
     die("Could not write Tensor Market banner (%d).\n", result);
   }
   
-  debug("tensor_write_coordinate: non-zero values: actual=%d.\n", tensor->nnz);
-  debug("tensor_write_coordinate: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n);
+  debug("tensor_fwrite_coordinate: non-zero values: actual=%d.\n", tensor->nnz);
+  debug("tensor_fwrite_coordinate: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n);
   
   if (0 != (result = mm_write_tensor_coordinate_size(file, tensor->l, tensor->m, tensor->n, tensor->nnz))) {
     die("Failed to write coordinate tensor of size %d (%d).\n", nnz, result);

From 628f29a942cf2d50d2e0e939c7ca9e4f896cbba7 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 18:42:07 -0700
Subject: [PATCH 17/57] + Set correct default for the number of threads to use
 whilst running in parallel

---
 src/tool_effectuate.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index b16ce94..857c0cc 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -83,7 +83,7 @@ timed_matrix_write(int argc, char *argv[], int const offset, matrix_t const *mat
   }
   
   timer_start(&t);
-  matrix_fwrite(file, matrix, format::coordinate);
+  matrix_fwrite(file, matrix, format::array);
   timer_end(&t);
   print_elapsed_time(t);
   
@@ -179,7 +179,8 @@ effectuate_tool_main(int argc, char *argv[])
   int c;
   
   /* set the program's defaults */
-  optcode   = DEFAULT_OPERATION;
+  optcode      = DEFAULT_OPERATION;
+  thread_count = DEFAULT_THREAD_COUNT;
   
   /* we will privide our own error messages */
   opterr = 0;
@@ -266,6 +267,7 @@ effectuate_tool_main(int argc, char *argv[])
   /* print program options, for debugging purposes */
   print_tool_options();
   debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode));
+  debug("effectuate_tool_main: thread_count=%d\n", thread_count);
   
   /* if we are just running a simulation, then we only do one
      iteration; otherwise, it would be really slow */

From 32f53338f827826651205296747c5f786e98cc74 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 18:43:15 -0700
Subject: [PATCH 18/57] + Fixed a small typo to check for the number of threads
 to use during the n-mode product calculation

---
 src/operation_n_mode_product.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index ac0ee24..9805e6b 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -412,6 +412,9 @@ n_mode_product_ekmr(matrix_t *matrix, vector_t const *vector, tensor_t const *te
   }
 }
 
+extern void
+n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor);
+
 void
 serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
@@ -420,6 +423,9 @@ serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *
   compatible(vector, tensor);
   
   switch (tensor->strategy) {
+  case strategy::array:
+    n_mode_product_array(matrix, vector, tensor);
+    break;
   case strategy::compressed:
     n_mode_product_compressed(matrix, vector, tensor);
     break;
@@ -447,7 +453,7 @@ operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t cons
 {
   debug("operation_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
   
-  if (1 == thread_count) {
+  if (thread_count <= 1) {
     serial_n_mode_product(matrix, vector, tensor);
   } else {
     threaded_n_mode_product(matrix, vector, tensor);

From 0104201f8d856a225815d009e7129bcd4b69f71d Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 21:12:56 -0700
Subject: [PATCH 19/57] + First multi-threaded version working

---
 src/operation_threaded_n_mode_product.cc | 65 +++++++++++-------------
 1 file changed, 30 insertions(+), 35 deletions(-)

diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc
index 2ebe875..b0a411d 100644
--- a/src/operation_threaded_n_mode_product.cc
+++ b/src/operation_threaded_n_mode_product.cc
@@ -38,41 +38,40 @@ typedef struct {
 static pthread_mutex_t tube_lock;
 
 int
-next_tube(product_thread_data_t *data)
+next_tube(product_thread_data_t *p)
 {
   uint k;
   
   pthread_mutex_lock(&tube_lock);
-  k = data->done++;
+  k = p->done++;
   pthread_mutex_unlock(&tube_lock);
-  return k < (data->tensor->m*data->tensor->n) ? k : -1;
+  return k < (p->tensor->n*p->tensor->n) ? k : -1;
 }
 
 void*
 fiber_product(void *arg)
 {
-  int                   k;
-  product_thread_data_t *data;
-  uint   i, j, k, index;
-  uint   m, n, l;
-  uint   *P;
-  double **M, *T;
-  
-  data = (product_thread_data_t*) arg;
-  
-  M = matrix->data;
-  P = vector->data;
-  T = tensor->values;
-  
-  l = tensor->l;
-  m = tensor->m;
-  n = tensor->n;
-  
-  while (-1 != (k = next_tube(data))) {
-    offset = k*data->tensor->l;
-    for (i = 0; i < l; ++i) {
-      T[offset+i];
-      // M[i][j] += P[k] * T[index];
+  int                   t;
+  uint                  i, j, k, offset;
+  uint                  n;
+  uint                  *P;
+  double                **M, *T;
+  product_thread_data_t *p;
+  
+  p = (product_thread_data_t*) arg;
+  
+  M = p->matrix->data;
+  P = p->vector->data;
+  T = p->tensor->values;
+  
+  n = p->tensor->n;
+  
+  while (-1 != (t = next_tube(p))) {
+    offset = t*n;
+    i      = t/n;
+    j      = t%n;
+    for (k = 0; k < n; ++k) {
+      M[i][j] += P[k] * T[offset+k];
     }
   }
   
@@ -85,7 +84,6 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   uint                  i;
   pthread_t             threads[32];
   int                   error;
-  int                   *status;
   product_thread_data_t data;
   
   data.done   = 0;
@@ -94,7 +92,7 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   data.tensor = tensor;
   
   pthread_mutex_init(&tube_lock, NULL);
-
+  
   for (i = 0; i < thread_count; ++i) {
     if (0 != (error = pthread_create(&threads[i], NULL, fiber_product, &data))) {
       die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error);
@@ -114,23 +112,20 @@ void
 n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
   uint   i, j, k, index;
-  uint   m, n, l;
+  uint   n;
   uint   *P;
   double **M, *T;
   
   debug("n_mode_product_array(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
   
+  n = tensor->n;
   M = matrix->data;
   P = vector->data;
   T = tensor->values;
   
-  l = tensor->l;
-  m = tensor->m;
-  n = tensor->n;
-  
-  for (i = 0; i < m; ++i) {
+  for (i = 0; i < n; ++i) {
     for (j = 0; j < n; ++j) {
-      for (k = 0; k < l; ++k) {
+      for (k = 0; k < n; ++k) {
 	index = tensor_index(tensor, i, j, k);
 	M[i][j] += P[k] * T[index];
       }
@@ -147,7 +142,7 @@ threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const
   
   switch (tensor->strategy) {
   case strategy::array:
-    n_mode_product_array(matrix, vector, tensor);
+    threaded_n_mode_product_array(matrix, vector, tensor);
     break;
   default:
     die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n",

From b2eeb3c43c2740411f798b41846a96f9c77ef49d Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 7 Nov 2011 22:25:42 -0700
Subject: [PATCH 20/57] + Added a local sum variable to use to later write to
 the shared matrix data structure

---
 src/operation_threaded_n_mode_product.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc
index b0a411d..557874e 100644
--- a/src/operation_threaded_n_mode_product.cc
+++ b/src/operation_threaded_n_mode_product.cc
@@ -53,7 +53,7 @@ fiber_product(void *arg)
 {
   int                   t;
   uint                  i, j, k, offset;
-  uint                  n;
+  uint                  n, sum;
   uint                  *P;
   double                **M, *T;
   product_thread_data_t *p;
@@ -67,12 +67,14 @@ fiber_product(void *arg)
   n = p->tensor->n;
   
   while (-1 != (t = next_tube(p))) {
+    sum    = 0;
     offset = t*n;
     i      = t/n;
     j      = t%n;
     for (k = 0; k < n; ++k) {
-      M[i][j] += P[k] * T[offset+k];
+      sum += P[k] * T[offset+k];
     }
+    M[i][j] = sum;
   }
   
   return NULL;

From d84724382e5b7e75457eeb6bbaec9827940a7c0b Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Tue, 8 Nov 2011 04:59:42 -0700
Subject: [PATCH 21/57] + Stride based access to tubes, to improve cache
 performance

---
 src/operation_threaded_n_mode_product.cc | 97 +++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 4 deletions(-)

diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc
index 557874e..071d77b 100644
--- a/src/operation_threaded_n_mode_product.cc
+++ b/src/operation_threaded_n_mode_product.cc
@@ -29,7 +29,9 @@ extern uint    thread_count;
 */
 
 typedef struct {
+  uint           *pdone;
   uint           done;
+  uint           id, offset, i;
   matrix_t       *matrix;
   vector_t const *vector;
   tensor_t const *tensor;
@@ -38,10 +40,11 @@ typedef struct {
 static pthread_mutex_t tube_lock;
 
 int
-next_tube(product_thread_data_t *p)
+serial_next_tube(product_thread_data_t *p)
 {
-  uint k;
+  uint   k;
   
+    
   pthread_mutex_lock(&tube_lock);
   k = p->done++;
   pthread_mutex_unlock(&tube_lock);
@@ -49,7 +52,7 @@ next_tube(product_thread_data_t *p)
 }
 
 void*
-fiber_product(void *arg)
+serial_fiber_product(void *arg)
 {
   int                   t;
   uint                  i, j, k, offset;
@@ -66,7 +69,7 @@ fiber_product(void *arg)
   
   n = p->tensor->n;
   
-  while (-1 != (t = next_tube(p))) {
+  while (-1 != (t = serial_next_tube(p))) {
     sum    = 0;
     offset = t*n;
     i      = t/n;
@@ -80,6 +83,91 @@ fiber_product(void *arg)
   return NULL;
 }
 
+int
+padded_next_tube(product_thread_data_t *p)
+{
+  uint k, choise;
+  
+  if (p->i < 10) {
+    choise = p->offset + p->i++;
+  } else {
+    p->offset += p->i*p->tensor->n;
+    p->i       = 0;
+    choise     = p->offset;
+  }
+  
+  pthread_mutex_lock(&tube_lock);
+  k = (*p->pdone)++;
+  pthread_mutex_unlock(&tube_lock);
+  return k < (p->tensor->n*p->tensor->n) ? choise : -1;
+}
+
+void*
+padded_fiber_product(void *arg)
+{
+  int                   t;
+  uint                  i, j, k, offset;
+  uint                  n, sum;
+  uint                  *P;
+  double                **M, *T;
+  product_thread_data_t *p;
+  
+  p = (product_thread_data_t*) arg;
+  
+  M = p->matrix->data;
+  P = p->vector->data;
+  T = p->tensor->values;
+  n = p->tensor->n;
+  
+  while (-1 != (t = padded_next_tube(p))) {
+    sum    = 0;
+    offset = t*n;
+    i      = t/n;
+    j      = t%n;
+    for (k = 0; k < n; ++k) {
+      sum += P[k] * T[offset+k];
+    }
+    M[i][j] = sum;
+  }
+  
+  return NULL;
+}
+
+void
+threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+{
+  uint                  i, done;
+  uint                  n;
+  pthread_t             threads[32];
+  int                   error;
+  product_thread_data_t data[32];
+  
+  n = tensor->n;
+  pthread_mutex_init(&tube_lock, NULL);
+  
+  for (i = 0; i < thread_count; ++i) {
+    data[i].pdone   = &done;
+    data[i].matrix = matrix;
+    data[i].vector = vector;
+    data[i].tensor = tensor;
+    data[i].offset = i*n;
+    data[i].i      = 0;
+    data[i].id     = i;
+    if (0 != (error = pthread_create(&threads[i], NULL, padded_fiber_product, &data[i]))) {
+      die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error);
+    }
+  }
+  
+  for (i = 0; i < thread_count; ++i) {
+    if (0 != (error = pthread_join(threads[i], NULL))) {
+      die("threaded_n_mode_product_array: pthread_join() failed with %d\n", error);
+    }
+  }
+  
+  pthread_mutex_destroy(&tube_lock);
+}
+
+#if 0
 void
 threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
@@ -109,6 +197,7 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   
   pthread_mutex_destroy(&tube_lock);
 }
+#endif
 
 void
 n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)

From 19fb803312972af18e07a5e1311cabdb85c40729 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Tue, 8 Nov 2011 05:33:44 -0700
Subject: [PATCH 22/57] + Test padding for improved scaling

---
 src/operation_threaded_n_mode_product.cc | 39 +++++++++++++++---------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc
index 071d77b..c0d4e4e 100644
--- a/src/operation_threaded_n_mode_product.cc
+++ b/src/operation_threaded_n_mode_product.cc
@@ -29,9 +29,9 @@ extern uint    thread_count;
 */
 
 typedef struct {
-  uint           *pdone;
+  uint           *pdone, *dummy;
   uint           done;
-  uint           id, offset, i;
+  uint           id, offset, i, stride;
   matrix_t       *matrix;
   vector_t const *vector;
   tensor_t const *tensor;
@@ -42,9 +42,8 @@ static pthread_mutex_t tube_lock;
 int
 serial_next_tube(product_thread_data_t *p)
 {
-  uint   k;
+  uint k;
   
-    
   pthread_mutex_lock(&tube_lock);
   k = p->done++;
   pthread_mutex_unlock(&tube_lock);
@@ -88,14 +87,16 @@ padded_next_tube(product_thread_data_t *p)
 {
   uint k, choise;
   
-  if (p->i < 10) {
+  if (p->i < p->stride) {
     choise = p->offset + p->i++;
   } else {
-    p->offset += p->i*p->tensor->n;
-    p->i       = 0;
+    p->offset += p->stride;
+    p->i       = 1;
     choise     = p->offset;
   }
   
+  //message("offset=%d\n", p->offset);
+  
   pthread_mutex_lock(&tube_lock);
   k = (*p->pdone)++;
   pthread_mutex_unlock(&tube_lock);
@@ -136,24 +137,32 @@ padded_fiber_product(void *arg)
 void
 threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
+  uint                  stride;
   uint                  i, done;
   uint                  n;
   pthread_t             threads[32];
   int                   error;
   product_thread_data_t data[32];
   
+  //message("sizeof(data)=%d\n", sizeof(data));
+  
   n = tensor->n;
   pthread_mutex_init(&tube_lock, NULL);
   
+  done         = 0;
+  stride       = 32 > tensor->n ? tensor->n : 2;
+  thread_count = thread_count > tensor->n ? tensor->n : thread_count;
+  
   for (i = 0; i < thread_count; ++i) {
-    data[i].pdone   = &done;
-    data[i].matrix = matrix;
-    data[i].vector = vector;
-    data[i].tensor = tensor;
-    data[i].offset = i*n;
-    data[i].i      = 0;
-    data[i].id     = i;
-    if (0 != (error = pthread_create(&threads[i], NULL, padded_fiber_product, &data[i]))) {
+    data[i+2].pdone  = &done;
+    data[i+2].matrix = matrix;
+    data[i+2].vector = vector;
+    data[i+2].tensor = tensor;
+    data[i+2].offset = i*stride;
+    data[i+2].i      = 0;
+    data[i+2].stride = stride;
+    data[i+2].id     = i;
+    if (0 != (error = pthread_create(&threads[i], NULL, serial_fiber_product, &data[i+2]))) {
       die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error);
     }
   }

From bfca0f6c362a84879522b4091947bf03f8e86750 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Tue, 8 Nov 2011 19:51:05 -0700
Subject: [PATCH 23/57] + Added simplified threading model

---
 src/Makefile  |   6 +-
 src/thread.cc | 213 +++++++++++++++++++++++++++++++++++++
 src/thread.h  | 283 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 499 insertions(+), 3 deletions(-)
 create mode 100644 src/thread.cc
 create mode 100644 src/thread.h

diff --git a/src/Makefile b/src/Makefile
index 6265660..91406cc 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -20,8 +20,8 @@ EXTRA_LDFLAGS=-Wall -lpthread $(EXTRA_DEBUG)
 
 HEADERS_CACHE=address.h cache.h hash.h
 HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h	\
-	 memory.h operation.h random.h strings.h timer.h tool.h		\
-	utility.h compatible.h
+	memory.h operation.h random.h thread.h strings.h timer.h	\
+	tool.h utility.h compatible.h
 HEADERS_GENERATE=generate.h
 HEADERS_MATRIX=matrix.h mmio.h
 HEADERS_TENSOR=storage.h tensor.h
@@ -34,7 +34,7 @@ SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc		\
 	information.cc latex.cc memory.cc mmio.cc			\
 	operation_n_mode_product.cc					\
 	operation_threaded_n_mode_product.cc operation_utility.cc	\
-	random.cc strings.cc timer.cc tool_convert.cc			\
+	random.cc strings.cc thread.cc timer.cc tool_convert.cc		\
 	tool_effectuate.cc tool_generate.cc tool_permute.cc		\
 	tool_timing.cc tool_utility.cc types.cc utility.cc
 SOURCES_GENERATE=generate_tensor_from_matrix.cc
diff --git a/src/thread.cc b/src/thread.cc
new file mode 100644
index 0000000..eec77dd
--- /dev/null
+++ b/src/thread.cc
@@ -0,0 +1,213 @@
+/***********************************************************************
+ * pt.c -- thread utility routines
+ *
+ * Author: Mark Hays <hays@math.arizona.edu>
+ */
+
+#include "thread.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>	/* for EBUSY */
+
+/*************************************************
+ * attempt to lock a mutex
+ */
+int thread_mutex_trylock(pthread_mutex_t *m)
+{
+  int res;
+
+  /* returns EBUSY if mutex is already locked,
+   * and EINVAL if the ptr is bad (on RedHat5.0)
+   *
+   * might this return EAGAIN on some systems??
+   * i can't find any docs on this one's retval!
+   *
+   */
+  if ((res=pthread_mutex_trylock(m)) != EBUSY) {
+    THREAD_DIE("thread_mutex_trylock",res);
+  }
+  return(res ? 1 : 0);
+}
+
+void 
+thread_wait(pthread_t *thread, thread_address_t exitcode)
+{
+  thread_address_t code, p;
+  int errcode; 
+  
+  p = (thread_address_t) ((exitcode)==NULL ? &code : exitcode);
+  if ((errcode=pthread_join(*thread, &p))) { 
+    THREAD_DIE("thread_wait", errcode);
+  }
+}
+
+/*************************************************
+ * run nthreads threads in the routine start
+ */
+void _thread_fork(int nthreads,
+	      thread_function_t start,
+	      thread_address_t arg,
+	      thread_address_t *exitcodes)
+{
+  int i;
+  thread_argument_t *args;
+  thread_address_t *address;
+  
+  if (nthreads<1) {
+    die("thread_mutex_trylock: nthreads<1\n");
+  }
+  if ((args=(thread_argument_t *) malloc(nthreads*sizeof(thread_argument_t)))==NULL) {
+    die("thread_fork: malloc failed!\n");
+  }
+  for (i=0; i<nthreads; i++) {
+    args[i].nthreads=nthreads; args[i].myid=i; args[i].data=arg;
+  }
+  for (i=0; i<nthreads; i++) {
+    thread_create(&args[i].self,start,args+i);
+  }
+  for (i=0; i<nthreads; i++) {
+    address = (exitcodes==NULL?NULL:exitcodes+i);
+    thread_wait(&args[i].self,address);
+  }
+  free(args);
+}
+
+/*************************************************
+ * initialize a gate
+ */
+void thread_gate_init(thread_gate_t *gate,int nthreads)
+{
+  gate->ngate=0; gate->nthreads=nthreads;
+  thread_mutex_init(  &gate->mutex);
+  thread_mutex_init(  &gate->block);
+  thread_cond_init (&gate->condvar);
+  thread_cond_init (   &gate->last);
+}
+
+/*************************************************
+ * destroy a gate variable
+ */
+void thread_gate_destroy(thread_gate_t *gate)
+{
+  gate->ngate=gate->nthreads=0;
+  thread_mutex_destroy(  &gate->mutex);
+  thread_mutex_destroy(  &gate->block);
+  thread_cond_destroy (&gate->condvar);
+  thread_cond_destroy (   &gate->last);
+}
+
+/*************************************************
+ * enter the gate
+ */
+void thread_gate_sync(thread_gate_t *gate)
+{
+  if (gate->nthreads<2) return;           /* trivial case            */
+  thread_mutex_lock(&gate->block);             /* lock the block -- new
+					      threads sleep here  */
+  thread_mutex_lock(&gate->mutex);             /* lock the mutex          */
+  if (++(gate->ngate) < gate->nthreads) { /* are we the last one in? */
+    thread_mutex_unlock(&gate->block);         /* no, unlock block and    */
+    thread_cond_wait(&gate->condvar,          /*   go to sleep           */
+		 &gate->mutex);
+  } else {                                /* yes, we're last         */
+    thread_cond_broadcast(&gate->condvar);     /* wake everyone up and    */
+    thread_cond_wait(&gate->last,&gate->mutex); /* go to sleep til they're
+					       all awake... then  */
+    thread_mutex_unlock(&gate->block);         /* release the block       */
+  }
+  if (--(gate->ngate)==1) {               /* next to last one out?   */
+    thread_cond_broadcast(&gate->last);        /* yes, wake up last one   */
+  }
+  thread_mutex_unlock(&gate->mutex);           /* release the mutex       */
+}
+
+/*************************************************
+ * Pipeline stage: the idea:
+ *
+ *   main thread   I/O thread
+ *           \      /      \
+ *            \    /        \
+ *            gate1         |
+ *           /     \        |
+ *          /       \       |
+ *       setup       |     work
+ *          \       /       |
+ *           \     /        |
+ *            gate2         |
+ *           /    \         /
+ *          /      \_______/
+ *         |
+ *   main continues
+ */
+
+/*************************************************
+ * couple of convenient macros
+ */
+#define GATE1(pipeline) thread_gate_sync(&((pipeline)->gate1))
+#define GATE2(pipeline) thread_gate_sync(&((pipeline)->gate2))
+#define STAGE(pipeline) (*((pipeline)->stageproc))((pipeline)->gdata)
+#define SETUP(pipeline) \
+  { thread_function_t fp; \
+    \
+    if ((fp=(pipeline)->setupproc)!=NULL) (*fp)(pipeline->gdata); \
+  }
+
+/*************************************************
+ * slave thread executes this
+ */
+static void _thread_pipeline_slave_code(thread_pipeline_t *pipeline)
+{
+  while (1) {
+    GATE1(pipeline);
+    if (pipeline->terminate) break;
+    GATE2(pipeline);
+    STAGE(pipeline);
+  }
+  thread_exit(NULL);
+}
+
+/*************************************************
+ * init the info struct and start up the slave
+ */
+void _thread_pipeline_init(thread_pipeline_t *pipeline,
+		       thread_address_t gdata,
+		       thread_function_t setupproc,
+		       thread_function_t stageproc)
+{
+  thread_gate_init(&(pipeline->gate1),2);
+  thread_gate_init(&(pipeline->gate2),2);
+  pipeline->terminate=0;
+  pipeline->gdata=gdata;
+  pipeline->setupproc=setupproc;
+  pipeline->stageproc=stageproc;
+  thread_create(&(pipeline->slave),_thread_pipeline_slave_code,pipeline);
+}
+
+/*************************************************
+ * kill the slave, free resources
+ */
+void thread_pipeline_destroy(thread_pipeline_t *pipeline)
+{
+  pipeline->terminate=1;
+  GATE1(pipeline);
+  thread_wait(&(pipeline->slave),NULL);
+  thread_gate_destroy(&(pipeline->gate1));
+  thread_gate_destroy(&(pipeline->gate2));
+  pipeline->gdata=NULL;
+  pipeline->setupproc=NULL;
+  pipeline->stageproc=NULL;
+}
+
+/*************************************************
+ * run the pipeline stage
+ */
+void thread_pipeline_execute(thread_pipeline_t *pipeline)
+{
+  GATE1(pipeline);
+  SETUP(pipeline);
+  GATE2(pipeline);
+}
+
+/* EOF pt.c */
+
diff --git a/src/thread.h b/src/thread.h
new file mode 100644
index 0000000..8099434
--- /dev/null
+++ b/src/thread.h
@@ -0,0 +1,283 @@
+/***********************************************************************
+ * thread.h, based on:
+ * pt.h -- pthreads utility macros
+ *
+ * Author: Mark Hays <hays@math.arizona.edu>
+ */
+
+#ifndef _THREAD_H_
+#define _THREAD_H_
+
+/* Linux defs:
+ *   _REENTRANT to get thread-safe libs
+ *   _POSIX_SOURCE to get POSIX semantics
+ *   _P is a hack for LinuxThreads -- on my box,
+ *      pthread.h includes sched.h. My sched.h
+ *      (incorrectly) declares prototypes with
+ *      _P instead of __P (which is what everything
+ *      else uses... Maybe it's just me.
+ */
+#ifdef __linux__
+#  define _REENTRANT
+#  define _POSIX_SOURCE
+#  define _P __P
+#endif
+
+#include <pthread.h>
+#include <string.h>
+#include "error.h"
+
+typedef void *(*thread_function_t)(void *);
+typedef void *thread_address_t;
+
+#define THREAD_DIE(func,errcode)					\
+  die("%s:%d: %s: %s.\n",__FILE__,__LINE__,func,strerror(errcode));
+
+/*************************************************
+ * low level wrappers that die on errors
+ */
+#define thread_create(t,start,arg) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_create(t, \
+			       NULL, \
+			       (thread_function_t) (start), \
+			       (thread_address_t) (arg)))) { \
+      THREAD_DIE("thread_create", errcode);		    \
+    }							    \
+}
+
+#define thread_create_detached(start,arg) \
+{ \
+    pthread_t t; \
+    int errcode; \
+    \
+    if ((errcode=pthread_create(&t, \
+			       NULL, \
+			       (thread_function_t) (start), \
+			       (thread_address_t) (arg)))) { \
+      THREAD_DIE("thread_create_detached", errcode);		    \
+    }							    \
+    if (pthread_detach(t)) {				    \
+      THREAD_DIE("thread_create_detached", errcode);		    \
+    }							    \
+}
+
+void thread_wait(pthread_t thread, thread_address_t exitcode);
+
+#if 0
+#define thread_wait(t,exitcode) \
+{ \
+    thread_address_t code; \
+    int errcode; \
+    \
+    if ((errcode=pthread_join(*(t), \
+		      (thread_address_t) ((exitcode)==NULL ? &code : (exitcode))))) { \
+      THREAD_DIE("thread_wait", errcode);				\
+    }									\
+}
+#endif
+
+#define thread_exit(status) \
+{ \
+    pthread_exit(status); \
+}
+
+#define thread_mutex_init(m) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_mutex_init(m,NULL))) { \
+      THREAD_DIE("thread_mutex_init", errcode);   \
+    }					      \
+}
+
+#define thread_mutex_destroy(m) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_mutex_destroy(m))) { \
+        THREAD_DIE("thread_mutex_destroy", errcode); \
+    }					      \
+}
+
+#define thread_cond_init(c) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_cond_init(c,NULL))) { \
+        THREAD_DIE("thread_cond_init", errcode); \
+    }					      \
+}
+
+#define thread_cond_destroy(c) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_cond_destroy(c))) { \
+        THREAD_DIE("thread_cond_destroy", errcode); \
+    }					      \
+}
+
+#define thread_mutex_lock(m) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_mutex_lock(m))) { \
+        THREAD_DIE("thread_mutex_lock", errcode); \
+    }					       \
+}
+
+/* This one has to do some extra checking so it
+ * isn't a macro...
+ */
+extern int thread_mutex_trylock(pthread_mutex_t *m, char *msg);
+
+#define thread_mutex_unlock(m) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_mutex_unlock(m))) { \
+        THREAD_DIE("thread_mutex_unlock", errcode); \
+    }					      \
+}
+
+#define thread_cond_wait(c,m) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_cond_wait(c,m))) { \
+        THREAD_DIE("thread_cond_wait", errcode); \
+    }					      \
+}
+
+#define thread_cond_broadcast(c) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_cond_broadcast(c))) { \
+        THREAD_DIE("thread_cond_broadcast", errcode); \
+    }					      \
+}
+
+#define thread_cond_signal(c) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_cond_signal(c))) { \
+      THREAD_DIE("thread_cond_signal", errcode);	\
+    }						\
+}
+
+/*************************************************
+ * N threads simultaneously doing the same thing
+ */
+
+typedef struct _thread_argument_t_ {
+  int myid;
+  int nthreads;
+  pthread_t self;
+  thread_address_t data;
+} thread_argument_t;
+
+#define thread_myid(th_arg)      ((th_arg)->myid)
+#define thread_nthreads(th_arg)  ((th_arg)->nthreads)
+#define thread_data(th_arg)      ((th_arg)->data)
+#define thread_self(th_arg)      (&((th_arg)->self))
+#define thread_thread(th_arg,id) (&(((th_arg)-((th_arg)->myid)+(id))->self))
+
+#define thread_cancel(th_arg,id) \
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_cancel(((th_arg)-((th_arg)->myid)+(id))->self))) {	\
+        THREAD_DIE("thread_cancel", errcode); \
+    }					       \
+}
+
+#define thread_cancel_all(th_arg) \
+{ \
+    int myid=(th_arg)->myid,nt=(th_arg)->nthreads,i,errcode; \
+    thread_argument_t *base=(th_arg)-myid; \
+    \
+    for (i=0; i<nt; i++) { \
+        if (i==id) continue; \
+        if ((errcode=pthread_cancel(base[i].self))) { \
+            THREAD_DIE("thread_cancel_all", errcode); \
+	}					   \
+    } \
+}
+
+extern void _thread_fork(int nthreads,
+		     thread_function_t start,
+		     thread_address_t arg,
+		     thread_address_t *exitcodes);
+
+#define thread_fork(nt,start,arg,codes) \
+  _thread_fork(nt,(thread_function_t) start, \
+	   (thread_address_t) arg,(thread_address_t *) codes)
+
+/*************************************************
+ * the gate struct (rendezvous point)
+ */
+typedef struct _thread_gate_t_ {
+  int ngate;              
+  int nthreads;           
+  pthread_mutex_t mutex;  
+  pthread_mutex_t block;  
+  pthread_cond_t condvar; 
+  pthread_cond_t last;    
+} thread_gate_t;
+
+extern void thread_gate_init(thread_gate_t *gate,int nthreads);
+extern void thread_gate_destroy(thread_gate_t *gate);
+extern void thread_gate_sync(thread_gate_t *gate);
+
+/*************************************************
+ * the pipeline struct (a single stage)
+ */
+typedef struct _thread_pipeline_t_ {
+  thread_address_t gdata;
+  thread_function_t setupproc;
+  thread_function_t stageproc;
+  int terminate;
+  pthread_t slave;
+  thread_gate_t gate1;
+  thread_gate_t gate2;
+} thread_pipeline_t;
+
+extern void _thread_pipeline_init(thread_pipeline_t *p,
+			      thread_address_t gdata,
+			      thread_function_t setup,
+			      thread_function_t stage);
+
+#define thread_pipeline_init(p,gdata,setup,stage)  \
+  _thread_pipeline_init(p,                         \
+		    (thread_address_t) gdata,         \
+		    (thread_function_t) setup, \
+		    (thread_function_t) stage)
+
+extern void thread_pipeline_destroy(thread_pipeline_t *p);
+extern void thread_pipeline_execute(thread_pipeline_t *p);
+
+/* names of C functions that can be called from
+ * FORTRAN without the trailing underscore(s)
+ */
+
+#ifdef __linux__
+   /* This works on my LinuxThreads + g77-0.5.21 */
+#  define F77PIPELINEINIT pipeline_init__
+#  define F77PIPELINEDONE pipeline_done__
+#  define F77PIPELINEEXEC pipeline_execute__
+#else
+   /* This seems to work everywhere else */
+#  define F77PIPELINEINIT pipeline_init_
+#  define F77PIPELINEDONE pipeline_done_
+#  define F77PIPELINEEXEC pipeline_execute_
+#endif
+
+#endif /* _THREAD_H_ */
+
+/* EOF pt.h */
+

From 5ca3d6f69344ccc0fd2ddb57bb3e2f820149ee6d Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Tue, 8 Nov 2011 19:52:41 -0700
Subject: [PATCH 24/57] + Added stride parameter, to allow separate threads to
 access memory that is far away from other threads

---
 src/main.cc            |  1 +
 src/tool.h             |  1 +
 src/tool_effectuate.cc | 18 ++++++++++++++----
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/main.cc b/src/main.cc
index 2b893fa..ee64dec 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -21,6 +21,7 @@ cache_t           *cache;
 uint              cache_size;
 uint              cache_line_size;
 uint              iterations;
+uint              memory_stride;
 uint              seed;
 uint              thread_count;
 char              *tool_name;
diff --git a/src/tool.h b/src/tool.h
index 851792d..f687070 100644
--- a/src/tool.h
+++ b/src/tool.h
@@ -24,6 +24,7 @@ namespace tool {
 
 #define DEFAULT_HUMAN_READABLE        true
 #define DEFAULT_ITERATIONS            1
+#define DEFAULT_MEMORY_STRIDE         32
 #define DEFAULT_OPERATION             operation::n_mode_product
 #define DEFAULT_ORIENTATION           orientation::row
 #define DEFAULT_PERMUTATION_HEURISTIC permutation_heuristic::none
diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index 857c0cc..314d116 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -21,8 +21,9 @@
 extern cache_t           *cache;
 extern uint              cache_size;
 extern uint              cache_line_size;
-extern uint              iterations;
 extern bool              human_readable;
+extern uint              iterations;
+extern uint              memory_stride;
 extern uint              thread_count;
 extern char              *tool_name;
 extern tool::type_t      tool_type;
@@ -179,14 +180,16 @@ effectuate_tool_main(int argc, char *argv[])
   int c;
   
   /* set the program's defaults */
-  optcode      = DEFAULT_OPERATION;
-  thread_count = DEFAULT_THREAD_COUNT;
+  memory_stride = DEFAULT_MEMORY_STRIDE;
+  optcode       = DEFAULT_OPERATION;
+  thread_count  = DEFAULT_THREAD_COUNT;
+  
   
   /* we will privide our own error messages */
   opterr = 0;
   
   /* extract any command-line options the user provided */
-  while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:p:st:TuvV:w"))) {
+  while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:p:r:st:TuvV:w"))) {
     switch (c) {
     case 'h': 
       effectuate_tool_usage();
@@ -216,6 +219,12 @@ effectuate_tool_main(int argc, char *argv[])
 	optcode = string_to_operation(optarg);
       }
       break;
+    case 'r':
+      memory_stride = atoi(optarg);
+      if (0 == memory_stride) {
+	memory_stride = DEFAULT_MEMORY_STRIDE;
+      }
+      break;
     case 's':
       simulate = !simulate;
       break;
@@ -267,6 +276,7 @@ effectuate_tool_main(int argc, char *argv[])
   /* print program options, for debugging purposes */
   print_tool_options();
   debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode));
+  debug("effectuate_tool_main: memory_stride=%d\n", memory_stride);
   debug("effectuate_tool_main: thread_count=%d\n", thread_count);
   
   /* if we are just running a simulation, then we only do one

From 3f997364189ae8f0c76a4e14d7578e7b9d1615a9 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Tue, 8 Nov 2011 19:54:24 -0700
Subject: [PATCH 25/57] + Updated tensor product to use simple threading model

---
 src/operation_threaded_n_mode_product.cc | 133 +++++------------------
 1 file changed, 29 insertions(+), 104 deletions(-)

diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc
index c0d4e4e..181696f 100644
--- a/src/operation_threaded_n_mode_product.cc
+++ b/src/operation_threaded_n_mode_product.cc
@@ -5,13 +5,14 @@
 #include "matrix.h"
 #include "operation.h"
 #include "tensor.h"
+#include "thread.h"
 #include "utility.h"
 #include "vector.h"
 #include <stdio.h>
 #include <stdlib.h>
-#include <pthread.h>
 
 extern cache_t *cache;
+extern uint    memory_stride;
 extern uint    thread_count;
 
 /*
@@ -29,9 +30,7 @@ extern uint    thread_count;
 */
 
 typedef struct {
-  uint           *pdone, *dummy;
   uint           done;
-  uint           id, offset, i, stride;
   matrix_t       *matrix;
   vector_t const *vector;
   tensor_t const *tensor;
@@ -40,87 +39,34 @@ typedef struct {
 static pthread_mutex_t tube_lock;
 
 int
-serial_next_tube(product_thread_data_t *p)
+serial_next_tube(product_thread_data_t *data)
 {
   uint k;
   
-  pthread_mutex_lock(&tube_lock);
-  k = p->done++;
-  pthread_mutex_unlock(&tube_lock);
-  return k < (p->tensor->n*p->tensor->n) ? k : -1;
+  thread_mutex_lock(&tube_lock);
+  k = data->done++;
+  thread_mutex_unlock(&tube_lock);
+  return k < (data->tensor->n*data->tensor->n) ? k : -1;
 }
 
-void*
-serial_fiber_product(void *arg)
+thread_address_t
+serial_fiber_product(thread_argument_t *argument)
 {
   int                   t;
   uint                  i, j, k, offset;
   uint                  n, sum;
   uint                  *P;
   double                **M, *T;
-  product_thread_data_t *p;
+  product_thread_data_t *data;
   
-  p = (product_thread_data_t*) arg;
+  data = (product_thread_data_t*) thread_data(argument);
   
-  M = p->matrix->data;
-  P = p->vector->data;
-  T = p->tensor->values;
+  n = data->tensor->n;
+  M = data->matrix->data;
+  P = data->vector->data;
+  T = data->tensor->values;
   
-  n = p->tensor->n;
-  
-  while (-1 != (t = serial_next_tube(p))) {
-    sum    = 0;
-    offset = t*n;
-    i      = t/n;
-    j      = t%n;
-    for (k = 0; k < n; ++k) {
-      sum += P[k] * T[offset+k];
-    }
-    M[i][j] = sum;
-  }
-  
-  return NULL;
-}
-
-int
-padded_next_tube(product_thread_data_t *p)
-{
-  uint k, choise;
-  
-  if (p->i < p->stride) {
-    choise = p->offset + p->i++;
-  } else {
-    p->offset += p->stride;
-    p->i       = 1;
-    choise     = p->offset;
-  }
-  
-  //message("offset=%d\n", p->offset);
-  
-  pthread_mutex_lock(&tube_lock);
-  k = (*p->pdone)++;
-  pthread_mutex_unlock(&tube_lock);
-  return k < (p->tensor->n*p->tensor->n) ? choise : -1;
-}
-
-void*
-padded_fiber_product(void *arg)
-{
-  int                   t;
-  uint                  i, j, k, offset;
-  uint                  n, sum;
-  uint                  *P;
-  double                **M, *T;
-  product_thread_data_t *p;
-  
-  p = (product_thread_data_t*) arg;
-  
-  M = p->matrix->data;
-  P = p->vector->data;
-  T = p->tensor->values;
-  n = p->tensor->n;
-  
-  while (-1 != (t = padded_next_tube(p))) {
+  while (-1 != (t = serial_next_tube(data))) {
     sum    = 0;
     offset = t*n;
     i      = t/n;
@@ -137,43 +83,22 @@ padded_fiber_product(void *arg)
 void
 threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
-  uint                  stride;
-  uint                  i, done;
-  uint                  n;
-  pthread_t             threads[32];
-  int                   error;
-  product_thread_data_t data[32];
-  
-  //message("sizeof(data)=%d\n", sizeof(data));
+  product_thread_data_t data;
   
-  n = tensor->n;
-  pthread_mutex_init(&tube_lock, NULL);
+  memory_stride = memory_stride > tensor->n ? tensor->n : memory_stride;
+  thread_count  = thread_count > tensor->n ? tensor->n : thread_count;
   
-  done         = 0;
-  stride       = 32 > tensor->n ? tensor->n : 2;
-  thread_count = thread_count > tensor->n ? tensor->n : thread_count;
+  debug("threaded_n_mode_product_array: memory_stride=%d\n", memory_stride);
+  debug("threaded_n_mode_product_array: thread_count=%d\n", thread_count);
   
-  for (i = 0; i < thread_count; ++i) {
-    data[i+2].pdone  = &done;
-    data[i+2].matrix = matrix;
-    data[i+2].vector = vector;
-    data[i+2].tensor = tensor;
-    data[i+2].offset = i*stride;
-    data[i+2].i      = 0;
-    data[i+2].stride = stride;
-    data[i+2].id     = i;
-    if (0 != (error = pthread_create(&threads[i], NULL, serial_fiber_product, &data[i+2]))) {
-      die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error);
-    }
-  }
-  
-  for (i = 0; i < thread_count; ++i) {
-    if (0 != (error = pthread_join(threads[i], NULL))) {
-      die("threaded_n_mode_product_array: pthread_join() failed with %d\n", error);
-    }
-  }
+  data.done   = 0;
+  data.matrix = matrix;
+  data.vector = vector;
+  data.tensor = tensor;
   
-  pthread_mutex_destroy(&tube_lock);
+  thread_mutex_init(&tube_lock);
+  thread_fork(thread_count, serial_fiber_product, &data, NULL);
+  thread_mutex_destroy(&tube_lock);
 }
 
 #if 0
@@ -193,7 +118,7 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   pthread_mutex_init(&tube_lock, NULL);
   
   for (i = 0; i < thread_count; ++i) {
-    if (0 != (error = pthread_create(&threads[i], NULL, fiber_product, &data))) {
+    if (0 != (error = pthread_create(&threads[i], NULL, (void* (*)(void*))serial_fiber_product, &data))) {
       die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error);
     }
   }

From b354da5972b51bc0bf3156679d2e2c41a75278ec Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Wed, 9 Nov 2011 12:11:58 -0700
Subject: [PATCH 26/57] + Removed sparse tensor support

---
 src/Makefile                             |  32 +-
 src/compatible.cc                        |   4 -
 src/generate_tensor_from_matrix.cc       |  17 +-
 src/information.cc                       |   2 +-
 src/main.cc                              |   2 -
 src/operation_n_mode_product.cc          | 460 +++++------------------
 src/operation_threaded_n_mode_product.cc | 177 ---------
 src/tensor.h                             |  66 +---
 src/tensor_clear.cc                      |  80 +---
 src/tensor_copy.cc                       |   3 +-
 src/tensor_emit_latex.cc                 |  57 +--
 src/tensor_free.cc                       |  68 ----
 src/tensor_malloc.cc                     |  19 +-
 src/tensor_read.cc                       |   8 +
 src/tensor_utility.cc                    |  53 ---
 src/tensor_validate.cc                   |   4 -
 src/tensor_write.cc                      |   4 +
 src/tool.h                               |   6 +-
 18 files changed, 133 insertions(+), 929 deletions(-)
 delete mode 100644 src/operation_threaded_n_mode_product.cc

diff --git a/src/Makefile b/src/Makefile
index 91406cc..d59896a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -24,7 +24,7 @@ HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h	\
 	tool.h utility.h compatible.h
 HEADERS_GENERATE=generate.h
 HEADERS_MATRIX=matrix.h mmio.h
-HEADERS_TENSOR=storage.h tensor.h
+HEADERS_TENSOR=tensor.h
 HEADERS_VECTOR=vector.h
 HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE)	\
 	$(HEADERS_MATRIX) $(HEADERS_TENSOR) $(HEADERS_VECTOR)
@@ -32,33 +32,23 @@ HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE)	\
 SOURCES_CACHE=address.cc cache.cc hash.cc
 SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc		\
 	information.cc latex.cc memory.cc mmio.cc			\
-	operation_n_mode_product.cc					\
-	operation_threaded_n_mode_product.cc operation_utility.cc	\
-	random.cc strings.cc thread.cc timer.cc tool_convert.cc		\
-	tool_effectuate.cc tool_generate.cc tool_permute.cc		\
-	tool_timing.cc tool_utility.cc types.cc utility.cc
+	operation_n_mode_product.cc operation_utility.cc random.cc	\
+	strings.cc thread.cc timer.cc tool_effectuate.cc		\
+	tool_generate.cc tool_timing.cc tool_utility.cc types.cc	\
+	utility.cc
 SOURCES_GENERATE=generate_tensor_from_matrix.cc
 SOURCES_MATRIX=matrix_arithmetic.cc matrix_clear.cc			\
 	matrix_compatible.cc matrix_copy.cc matrix_free.cc		\
 	matrix_malloc.cc matrix_partition.cc matrix_supported.cc	\
 	matrix_read.cc matrix_write.cc
-SOURCES_STORAGE=tensor_storage_convert.cc				\
-	tensor_storage_compressed.cc					\
-	tensor_storage_compressed_slice.cc				\
-	tensor_storage_coordinate.cc tensor_storage_matrix_slice.cc	\
-	tensor_emit_latex.cc tensor_storage_ekmr.cc			\
-	tensor_storage_gundersen.cc tensor_storage_malloc.cc		\
-	tensor_storage_utility.cc tensor_storage_zzekmr.cc
-SOURCES_TENSOR=tensor_arithmetic.cc tensor_clear.cc tensor_convert.cc	\
-	tensor_copy.cc tensor_free.cc tensor_malloc.cc			\
-	tensor_ownership.cc tensor_permute.cc tensor_supported.cc	\
-	tensor_read.cc tensor_write.cc tensor_utility.cc		\
-	tensor_validate.cc
+SOURCES_TENSOR=tensor_arithmetic.cc tensor_clear.cc tensor_copy.cc	\
+	tensor_emit_latex.cc tensor_free.cc tensor_malloc.cc		\
+	tensor_ownership.cc tensor_supported.cc tensor_read.cc		\
+	tensor_write.cc tensor_utility.cc tensor_validate.cc
 SOURCES_VECTOR=vector_clear.cc vector_free.cc vector_malloc.cc	\
 	vector_read.cc vector_write.cc
-SOURCES=$(SOURCES_CACHE) $(SOURCES_GENERAL) $(SOURCES_GENERATE)	\
-	$(SOURCES_MATRIX) $(SOURCES_STORAGE) $(SOURCES_TENSOR)	\
-	$(SOURCES_VECTOR) main.cc
+SOURCES=$(SOURCES_CACHE) $(SOURCES_GENERAL) $(SOURCES_GENERATE)		\
+	$(SOURCES_MATRIX) $(SOURCES_TENSOR) $(SOURCES_VECTOR) main.cc
 
 ASSEMBLER=$(SOURCES:.cc=.s)
 OBJECTS=$(ASSEMBLER:.s=.o)
diff --git a/src/compatible.cc b/src/compatible.cc
index 2f7ddfe..a46c7b5 100644
--- a/src/compatible.cc
+++ b/src/compatible.cc
@@ -87,10 +87,6 @@ compatible(tensor_t const *lhs, tensor_t const *rhs)
 {
   debug("compatible(tensor=0x%x, tensor=0x%x)\n", lhs, rhs);
   
-  if (lhs->nnz != rhs->nnz) {
-    die("Tensors do not have the same number non-zero entries.\n");
-  }
-  
   if (lhs->l != rhs->l || lhs->m != rhs->m || lhs->n != rhs->n) {
     die("Tensors do not have the same dimensions.\n");
   }
diff --git a/src/generate_tensor_from_matrix.cc b/src/generate_tensor_from_matrix.cc
index 468b4b9..2f4fd91 100644
--- a/src/generate_tensor_from_matrix.cc
+++ b/src/generate_tensor_from_matrix.cc
@@ -26,12 +26,11 @@
 tensor_t*
 generate_tensor_from_matrix(matrix_t *matrix)
 {
+#if 0
   uint                        i, j, k;
-  uint                        nnz, size, n;
+  uint                        size, n;
   uint                        lower, upper;
   tensor_t                    *tensor;
-  tensor_storage_coordinate_t *storage;
-  coordinate_tuple_t          *tuples;
   double                      *values;
   double                      **data;
   
@@ -40,15 +39,6 @@ generate_tensor_from_matrix(matrix_t *matrix)
   n     = matrix->n;
   upper = matrix->n*matrix->n;
   data  = matrix->data;
-  nnz   = 0;
-  
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n; ++j) {
-      if (!might_as_well_be_zero(data[i][j])) {
-	nnz++;
-      }
-    }
-  }
   
   lower   = nnz;
   nnz    *= n;
@@ -85,4 +75,7 @@ generate_tensor_from_matrix(matrix_t *matrix)
   }
   
   return tensor;
+#endif
+  
+  return NULL;
 }
diff --git a/src/information.cc b/src/information.cc
index 5425b9d..4a71dd7 100644
--- a/src/information.cc
+++ b/src/information.cc
@@ -20,6 +20,6 @@ void
 print_information(tensor_t const* tensor)
 {
   debug("print_information(tensor=0x%x)\n", tensor);
-  debug("l=%d, m=%d, n=%d, nnz=%d\n", tensor->l, tensor->m, tensor->n, tensor->nnz);
+  debug("l=%d, m=%d, n=%d, nnz=%d\n", tensor->l, tensor->m, tensor->n);
   debug("strategy='%s', orientation='%s'\n", strategy_to_string(tensor->strategy), orientation_to_string(tensor->orientation));
 }
diff --git a/src/main.cc b/src/main.cc
index ee64dec..b84b7bb 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -53,10 +53,8 @@ main(int argc, char *argv[])
   } entrypoints[] = {
     { NULL },
     { NULL },
-    { &convert_tool_main },
     { &generate_tool_main },
     { &effectuate_tool_main },
-    { &permute_tool_main },
     { NULL }
   };
   
diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index 9805e6b..c5b1e57 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -4,6 +4,7 @@
 #include "error.h"
 #include "matrix.h"
 #include "operation.h"
+#include "thread.h"
 #include "tensor.h"
 #include "utility.h"
 #include "vector.h"
@@ -11,8 +12,11 @@
 #include <stdlib.h>
 
 extern cache_t *cache;
+extern uint    memory_stride;
 extern uint    thread_count;
 
+static pthread_mutex_t tube_lock;
+
 /*
   Computing ($pT$):
   Let $\T \in R^{n\times n\times n}$ be a tensor.
@@ -27,394 +31,119 @@ extern uint    thread_count;
   end for
 */
 
-void
-compressed_row(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
-{
-  uint                       i, j, k;
-  uint                       rn, nnz;
-  uint                       start, end;
-  uint                       c, r, r0, t, m, n;
-  double                     **M;
-  double const               *V;
-  uint const                 *p, *R, *C, *T;
-  tensor_storage_compressed_t const *storage;
-  
-  debug("compressed_row(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
-  
-  p       = vector->data;
-  M       = matrix->data;
-  V       = tensor->values;
-  nnz     = tensor->nnz;
-  m       = matrix->m;
-  n       = matrix->n;
-  
-  storage = STORAGE_COMPRESSED(tensor);
-  rn      = storage->rn;
-  R       = storage->RO;
-  C       = storage->CO;
-  T       = storage->KO;
-  
-  /*
-    Using \emph{compressed row storage} ($\CRS$), this tensor can be
-    represented as:
-    
-           $k$   0   1   2    3   4   5   6    7   8   9   10   11
-     $\rowcrs$ & 0 & 4 & 8 & 12
-     $\colcrs$ & 1 & 3 & 0 &  2 & 0 & 2 & 1 &  2 & 1 & 2 &  0 &  3
-    $\tubecrs$ & 0 & 0 & 1 &  1 & 0 & 0 & 1 &  1 & 0 & 0 &  1 &  1
-     $\valcrs$ & 1 & 2 & 7 &  8 & 3 & 4 & 9 & 10 & 5 & 6 & 11 & 12
-  */
-  
-  DEBUG("\n");
-  
-  for (r = 1; r < rn; ++r) {
-    r0    = r-1;
-    i     = r0 % n;
-    start = R[r0];
-    end   = R[r];
-    
-    CACHE_ACCESS(cache, &R[r0], cache_operation::read, "R[r=%d]", r0);
-    CACHE_ACCESS(cache, &R[r],  cache_operation::read, "R[r=%d]", r);
-    
-    DEBUG("start=%d, end=%d\n", start, end);
-    
-    for (k = start; k < end; ++k) {
-      
-      c = C[k];
-      j = c;
-      t = T[k];
-      
-      DEBUG("i=%d, j=%d, t=%d, r=%d, c=%d, k=%d\n", i, j, t, r, c, k);
-      
-      CACHE_ACCESS(cache, &C[k], cache_operation::read, "C[k=%d]", k);
-      CACHE_ACCESS(cache, &T[k], cache_operation::read, "T[k=%d]", k);
-      
-      // trace("(M[i=%2d][j=%2d]=%2.0f += (p[t=%2d]=%2d * V[k=%2d]=%2.0f)=%2.0f))=%2.0f\n", i, j, M[i][j], t, p[t], k, V[k], p[t] * V[k], M[i][j] + p[t] * V[k]);
-      
-      M[i][j] += p[t] * V[k];
-      
-      CACHE_ACCESS(cache, &V[k],    cache_operation::read,  "V[k=%d]", k);
-      CACHE_ACCESS(cache, &p[t],    cache_operation::read,  "P[t=%d]", t);
-      CACHE_ACCESS(cache, &M[i][j], cache_operation::read,  "M[i=%d][j=%d]", i, j);
-      CACHE_ACCESS(cache, &M[i][j], cache_operation::write, "M[i=%d][j=%d]", i, j);
-      
-      CACHE_DEBUG(cache);
-    }
-  }
-}
-
-void
-compressed_tube(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
-{
-  uint                       i, j, k;
-  uint                       rn, nnz;
-  uint                       start, end;
-  uint                       c, r, r0, t, m, n;
-  double                     **M;
-  double const               *V;
-  uint const                 *p, *R, *C, *T;
-  tensor_storage_compressed_t const *storage;
-  
-  debug("compressed_tube(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
-  
-  p       = vector->data;
-  M       = matrix->data;
-  V       = tensor->values;
-  nnz     = tensor->nnz;
-  m       = matrix->m;
-  n       = matrix->n;
-  
-  storage = STORAGE_COMPRESSED(tensor);
-  rn      = storage->rn;
-  R       = storage->RO;
-  C       = storage->CO;
-  T       = storage->KO;
-  
-  /*
-    Using \emph{compressed row storage} ($\CRS$), this tensor can be
-    represented as:
-    
-           $k$   0   1   2    3   4   5   6    7   8   9   10   11
-     $\rowcrs$ & 0 & 4 & 8 & 12
-     $\colcrs$ & 1 & 3 & 0 &  2 & 0 & 2 & 1 &  2 & 1 & 2 &  0 &  3
-    $\tubecrs$ & 0 & 0 & 1 &  1 & 0 & 0 & 1 &  1 & 0 & 0 &  1 &  1
-     $\valcrs$ & 1 & 2 & 7 &  8 & 3 & 4 & 9 & 10 & 5 & 6 & 11 & 12
-  */
-  
-  DEBUG("\n");
-  
-  for (r = 1; r < rn; ++r) {
-    r0    = r-1;
-    i     = r0 % n;
-    start = R[r0];
-    end   = R[r];
-    
-    CACHE_ACCESS(cache, &R[r0], cache_operation::read, "R[r=%d]", r0);
-    CACHE_ACCESS(cache, &R[r],  cache_operation::read, "R[r=%d]", r);
-    
-    DEBUG("start=%d, end=%d\n", start, end);
-    
-    for (k = start; k < end; ++k) {
-      c = C[k];
-      t = T[k]; // row
-      j = t;
-      
-      DEBUG("i=%d, j=%d, t=%d, r=%d, c=%d, k=%d\n", i, j, t, r, c, k);
-      
-      CACHE_ACCESS(cache, &C[k], cache_operation::read, "C[k=%d]", k);
-      CACHE_ACCESS(cache, &T[k], cache_operation::read, "T[k=%d]", k);
-      
-      // trace("(M[i=%2d][j=%2d]=%2.0f += (p[c=%2d]=%2d * V[k=%2d]=%2.0f)=%2.0f))=%2.0f\n", i, j, M[i][j], c, p[c], k, V[k], p[c] * V[k], M[i][j] + p[c] * V[k]);
-      
-      M[i][j] += p[c] * V[k];
-      
-      CACHE_ACCESS(cache, &V[k],    cache_operation::read,  "V[k=%d]", k);
-      CACHE_ACCESS(cache, &p[c],    cache_operation::read,  "P[c=%d]", c);
-      CACHE_ACCESS(cache, &M[i][j], cache_operation::read,  "M[i=%d][j=%d]", i, j);
-      CACHE_ACCESS(cache, &M[i][j], cache_operation::write, "M[i=%d][j=%d]", i, j);
-      
-      CACHE_DEBUG(cache);
-    }
-  }
-}
+typedef struct {
+  uint           done;
+  matrix_t       *matrix;
+  vector_t const *vector;
+  tensor_t const *tensor;
+} product_thread_data_t;
 
-void
-n_mode_product_compressed(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+int
+traditional_next_tube(product_thread_data_t *data)
 {
-  debug("n_mode_product_compressed(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  uint k;
   
-  switch (tensor->orientation) {
-  case orientation::row:
-    compressed_row(matrix, vector, tensor);
-    break;
-  case orientation::tube:
-    compressed_tube(matrix, vector, tensor);
-    break;
-  default:
-    die("Tensor product for '%s' orientation is not currently supported.\n",
-	orientation_to_string(tensor->orientation));
-    break;
-  }
+  thread_mutex_lock(&tube_lock);
+  k = data->done++;
+  thread_mutex_unlock(&tube_lock);
+  return k < (data->tensor->n*data->tensor->n) ? k : -1;
 }
 
-typedef void (*index_convert_t)(uint rr, uint kk, uint n, uint *i, uint *j, uint *t);
-
-void
-converter_for_lateral(uint rr, uint kk, uint n, uint *i, uint *j, uint *t)
+thread_address_t
+traditional_fiber_product(thread_argument_t *argument)
 {
-  *i = kk / n;
-  *j = rr;
-  *t = kk % n;
-}
-
-void
-converter_for_horizontal(uint rr, uint kk, uint n, uint *i, uint *j, uint *t)
-{
-  *i = rr;
-  *j = kk / n;
-  *t = kk % n;
-}
-
-void
-converter_for_frontal(uint rr, uint kk, uint n, uint *i, uint *j, uint *t)
-{
-  *i = kk / n;
-  *j = kk % n;
-  *t = rr;
-}
-
-void
-compressed_slice(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor, index_convert_t converter)
-{
-  uint                       i, j, k, kk;
-  uint                       rn, nnz;
-  uint                       start, end;
-  uint                       r, rr, r0, t, m, n;
-  double                     **M;
-  double const               *V;
-  uint const                 *p, *R, *K;
-  tensor_storage_compressed_t const *storage;
-  
-  debug("compressed_slice(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
-  
-  p       = vector->data;
-  M       = matrix->data;
-  V       = tensor->values;
-  nnz     = tensor->nnz;
-  m       = matrix->m;
-  n       = matrix->n;
-  
-  storage = STORAGE_COMPRESSED(tensor);
-  rn      = storage->rn;
-  R       = storage->RO;
-  K       = storage->KO;
-  
-  /*
-    Using \emph{compressed row storage} ($\CRS$), this tensor can be
-    represented as:
-    
-           $k$   0   1   2    3   4   5   6    7   8   9   10   11
-     $\rowcrs$ & 0 & 4 & 8 & 12
-     $\colcrs$ & 1 & 3 & 0 &  2 & 0 & 2 & 1 &  2 & 1 & 2 &  0 &  3
-    $\tubecrs$ & 0 & 0 & 1 &  1 & 0 & 0 & 1 &  1 & 0 & 0 &  1 &  1
-     $\valcrs$ & 1 & 2 & 7 &  8 & 3 & 4 & 9 & 10 & 5 & 6 & 11 & 12
-  */
-  
-  DEBUG("\n");
-  
-  for (r = 1; r < rn; ++r) {
-    r0    = r-1;
-    rr    = r0;
-    start = R[r0];
-    end   = R[r];
-    
-    CACHE_ACCESS(cache, &R[r0], cache_operation::read, "R[r=%d]", r0);
-    CACHE_ACCESS(cache, &R[r],  cache_operation::read, "R[r=%d]", r);
-    
-    DEBUG("start=%d, end=%d\n", start, end);
-    
-    for (k = start; k < end; ++k) {
-      kk = K[k];
-      
-      converter(rr, kk, n, &i, &j, &t);
-      DEBUG("i=%d, j=%d, t=%d, r=%d, k=%d\n", i, j, t, r, k);
-      
-      CACHE_ACCESS(cache, &K[k], cache_operation::read, "K[k=%d]", k);
-      
-      // trace("(M[i=%2d][j=%2d]=%2.0f += (p[t=%2d]=%2d * V[k=%2d]=%2.0f)=%2.0f))=%2.0f\n", i, j, M[i][j], t, p[t], k, V[k], p[t] * V[k], M[i][j] + p[t] * V[k]);
-      
-      M[i][j] += p[t] * V[k];
-      
-      CACHE_ACCESS(cache, &V[k],    cache_operation::read,  "V[k=%d]", k);
-      CACHE_ACCESS(cache, &p[t],    cache_operation::read,  "P[t=%d]", t);
-      CACHE_ACCESS(cache, &M[i][j], cache_operation::read,  "M[i=%d][j=%d]", i, j);
-      CACHE_ACCESS(cache, &M[i][j], cache_operation::write, "M[i=%d][j=%d]", i, j);
-      
-      CACHE_DEBUG(cache);
+  int                   t;
+  uint                  i, j, k, offset;
+  uint                  n, sum;
+  uint                  *P;
+  double                **M, *T;
+  product_thread_data_t *data;
+  
+  data = (product_thread_data_t*) thread_data(argument);
+  
+  n = data->tensor->n;
+  M = data->matrix->data;
+  P = data->vector->data;
+  T = data->tensor->values;
+  
+  while (-1 != (t = traditional_next_tube(data))) {
+    sum    = 0;
+    offset = t*n;
+    i      = t/n;
+    j      = t%n;
+    for (k = 0; k < n; ++k) {
+      sum += P[k] * T[offset+k];
     }
+    M[i][j] = sum;
   }
+  
+  return NULL;
 }
 
 void
-n_mode_product_compressed_slice(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
-  index_convert_t converter;
+  product_thread_data_t data;
   
-  debug("n_mode_product_compressed_slice(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  memory_stride = memory_stride > tensor->n ? tensor->n : memory_stride;
+  thread_count  = thread_count > tensor->n ? tensor->n : thread_count;
   
-  converter = NULL;
+  debug("threaded_n_mode_product_array: memory_stride=%d\n", memory_stride);
+  debug("threaded_n_mode_product_array: thread_count=%d\n", thread_count);
   
-  switch (tensor->orientation) {
-  case orientation::horizontal:
-    converter = &converter_for_horizontal;
-    break;
-  case orientation::lateral:
-    converter = &converter_for_lateral;
-    break;
-  case orientation::frontal:
-    converter = &converter_for_frontal;
-    break;
-  default:
-    die("Tensor product for '%s' orientation is not currently supported.\n",
-	orientation_to_string(tensor->orientation));
-    break;
-  }
+  data.done   = 0;
+  data.matrix = matrix;
+  data.vector = vector;
+  data.tensor = tensor;
   
-  compressed_slice(matrix, vector, tensor, converter);
+  thread_mutex_init(&tube_lock);
+  thread_fork(thread_count, traditional_fiber_product, &data, NULL);
+  thread_mutex_destroy(&tube_lock);
 }
-
+ 
 void
-ekmr_row(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+serial_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
-  uint                 i, j, k;
-  uint                 rn, nnz;
-  uint                 start, end;
-  uint                 c, ck, r, r0, t, m, n;
-  double               **M;
-  double const         *V;
-  uint const           *p, *R, *CK;
-  tensor_storage_extended_t const *storage;
-  
-  debug("ekmr_row(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
-  
-  p       = vector->data;
-  M       = matrix->data;
-  V       = tensor->values;
-  nnz     = tensor->nnz;
-  m       = matrix->m;
-  n       = matrix->n;
-  
-  storage = STORAGE_EXTENDED(tensor);
-  rn      = storage->rn;
-  R       = storage->RO;
-  CK      = storage->CK;
-  
-  /* 
-     Now, using \emph{extended compressed row storage} ($\ECRS$), the
-     original tensor can be represented as:
-     
-           $k$   0   1   2    3   4   5   6    7   8   9   10   11
-     $\rowcrs$ & 0 & 4 & 8 & 12
-      $\ctcrs$ & 1 & 2 & 5 &  6 & 0 & 3 & 4 &  5 &  1 & 2 & 4 &  7
-     $\valcrs$ & 7 & 1 & 8 &  2 & 3 & 9 & 4 & 10 & 11 & 5 & 6 & 12
-  */
-  
-  DEBUG("\n");
-  
-  for (r = 1; r < rn; ++r) {
-    r0    = r-1;
-    i     = r0 % n;
-    start = R[r0];
-    end   = R[r];
-    
-    CACHE_ACCESS(cache, &R[r0], cache_operation::read, "R[r=%d]", r0);
-    CACHE_ACCESS(cache, &R[r],  cache_operation::read, "R[r=%d]", r);
-    
-    DEBUG("start=%d, end=%d\n", start, end);
-    
-    for (k = start; k < end; ++k) {
-      ck = CK[k];
-      c  = ck / n;
-      j  = c;
-      t  = ck % n;
-      
-      DEBUG("i=%d, j=%d, t=%d, r=%d, c=%d, k=%d\n", i, j, t, r, c, k);
-      
-      CACHE_ACCESS(cache, &CK[k], cache_operation::read, "CK[k=%d]", k);
-      
-      // trace("(M[i=%2d][j=%2d]=%2.0f += (p[t=%2d]=%2d * V[k=%2d]=%2.0f)=%2.0f))=%2.0f\n", i, j, M[i][j], t, p[t], k, V[k], p[t] * V[k], M[i][j] + p[t] * V[k]);
-      
-      M[i][j] += p[t] * V[k];
-      
-      CACHE_ACCESS(cache, &V[k],    cache_operation::read,  "V[k=%d]", k);
-      CACHE_ACCESS(cache, &p[t],    cache_operation::read,  "P[t=%d]", t);
-      CACHE_ACCESS(cache, &M[i][j], cache_operation::read,  "M[i=%d][j=%d]", i, j);
-      CACHE_ACCESS(cache, &M[i][j], cache_operation::write, "M[i=%d][j=%d]", i, j);
-      
-      CACHE_DEBUG(cache);
+  uint   i, j, k, index;
+  uint   n;
+  uint   *P;
+  double **M, *T;
+  
+  debug("n_mode_product_array(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  
+  n = tensor->n;
+  M = matrix->data;
+  P = vector->data;
+  T = tensor->values;
+  
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n; ++j) {
+      for (k = 0; k < n; ++k) {
+	index = tensor_index(tensor, i, j, k);
+	M[i][j] += P[k] * T[index];
+      }
     }
   }
 }
 
 void
-n_mode_product_ekmr(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
-  debug("n_mode_product_ekmr(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  debug("threaded_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  
+  compatible(vector, tensor);
   
-  switch (tensor->orientation) {
-  case orientation::row:
-    ekmr_row(matrix, vector, tensor);
+  switch (tensor->strategy) {
+  case strategy::array:
+    threaded_n_mode_product_array(matrix, vector, tensor);
     break;
   default:
-    die("Tensor product for '%s' orientation is not currently supported.\n",
-	orientation_to_string(tensor->orientation));
+    die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n",
+	strategy_to_string(tensor->strategy));
     break;
   }
 }
 
-extern void
-n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor);
-
 void
 serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
@@ -424,30 +153,15 @@ serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *
   
   switch (tensor->strategy) {
   case strategy::array:
-    n_mode_product_array(matrix, vector, tensor);
-    break;
-  case strategy::compressed:
-    n_mode_product_compressed(matrix, vector, tensor);
-    break;
-  case strategy::slice:
-    n_mode_product_compressed_slice(matrix, vector, tensor);
-    break;
-  case strategy::ekmr:
-  case strategy::zzekmr:  /* NOTE: the encoding may differ, but the
-			     way we calculate products remains the
-			     same.  How is that for simplicity? */
-    n_mode_product_ekmr(matrix, vector, tensor);
+    serial_n_mode_product_array(matrix, vector, tensor);
     break;
   default:
-    die("Tensor product for '%s' strategy is not currently supported.\n",
+    die("serial_n_mode_product: tensor product for '%s' strategy is not currently supported.\n",
 	strategy_to_string(tensor->strategy));
     break;
   }
 }
 
-extern void 
-threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor);
-
 void
 operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc
deleted file mode 100644
index 181696f..0000000
--- a/src/operation_threaded_n_mode_product.cc
+++ /dev/null
@@ -1,177 +0,0 @@
-
-#include "cache.h"
-#include "compatible.h"
-#include "error.h"
-#include "matrix.h"
-#include "operation.h"
-#include "tensor.h"
-#include "thread.h"
-#include "utility.h"
-#include "vector.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-extern cache_t *cache;
-extern uint    memory_stride;
-extern uint    thread_count;
-
-/*
-  Computing ($pT$):
-  Let $\T \in R^{n\times n\times n}$ be a tensor.
-  Let $\M \in R^{n\times n}$ be a matrix.
-  Let $p \in R^{n}$ be a vector.
-  for i = 1 to l do
-    for j = 1 to m do 
-      for k = 1 to m do
-        M[i][j] += p[k] * T[i][j][k]
-      end for
-    end for
-  end for
-*/
-
-typedef struct {
-  uint           done;
-  matrix_t       *matrix;
-  vector_t const *vector;
-  tensor_t const *tensor;
-} product_thread_data_t;
-
-static pthread_mutex_t tube_lock;
-
-int
-serial_next_tube(product_thread_data_t *data)
-{
-  uint k;
-  
-  thread_mutex_lock(&tube_lock);
-  k = data->done++;
-  thread_mutex_unlock(&tube_lock);
-  return k < (data->tensor->n*data->tensor->n) ? k : -1;
-}
-
-thread_address_t
-serial_fiber_product(thread_argument_t *argument)
-{
-  int                   t;
-  uint                  i, j, k, offset;
-  uint                  n, sum;
-  uint                  *P;
-  double                **M, *T;
-  product_thread_data_t *data;
-  
-  data = (product_thread_data_t*) thread_data(argument);
-  
-  n = data->tensor->n;
-  M = data->matrix->data;
-  P = data->vector->data;
-  T = data->tensor->values;
-  
-  while (-1 != (t = serial_next_tube(data))) {
-    sum    = 0;
-    offset = t*n;
-    i      = t/n;
-    j      = t%n;
-    for (k = 0; k < n; ++k) {
-      sum += P[k] * T[offset+k];
-    }
-    M[i][j] = sum;
-  }
-  
-  return NULL;
-}
-
-void
-threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
-{
-  product_thread_data_t data;
-  
-  memory_stride = memory_stride > tensor->n ? tensor->n : memory_stride;
-  thread_count  = thread_count > tensor->n ? tensor->n : thread_count;
-  
-  debug("threaded_n_mode_product_array: memory_stride=%d\n", memory_stride);
-  debug("threaded_n_mode_product_array: thread_count=%d\n", thread_count);
-  
-  data.done   = 0;
-  data.matrix = matrix;
-  data.vector = vector;
-  data.tensor = tensor;
-  
-  thread_mutex_init(&tube_lock);
-  thread_fork(thread_count, serial_fiber_product, &data, NULL);
-  thread_mutex_destroy(&tube_lock);
-}
-
-#if 0
-void
-threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
-{
-  uint                  i;
-  pthread_t             threads[32];
-  int                   error;
-  product_thread_data_t data;
-  
-  data.done   = 0;
-  data.matrix = matrix;
-  data.vector = vector;
-  data.tensor = tensor;
-  
-  pthread_mutex_init(&tube_lock, NULL);
-  
-  for (i = 0; i < thread_count; ++i) {
-    if (0 != (error = pthread_create(&threads[i], NULL, (void* (*)(void*))serial_fiber_product, &data))) {
-      die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error);
-    }
-  }
-  
-  for (i = 0; i < thread_count; ++i) {
-    if (0 != (error = pthread_join(threads[i], NULL))) {
-      die("threaded_n_mode_product_array: pthread_join() failed with %d\n", error);
-    }
-  }
-  
-  pthread_mutex_destroy(&tube_lock);
-}
-#endif
-
-void
-n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
-{
-  uint   i, j, k, index;
-  uint   n;
-  uint   *P;
-  double **M, *T;
-  
-  debug("n_mode_product_array(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
-  
-  n = tensor->n;
-  M = matrix->data;
-  P = vector->data;
-  T = tensor->values;
-  
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n; ++j) {
-      for (k = 0; k < n; ++k) {
-	index = tensor_index(tensor, i, j, k);
-	M[i][j] += P[k] * T[index];
-      }
-    }
-  }
-}
-
-void
-threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
-{
-  debug("threaded_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
-  
-  compatible(vector, tensor);
-  
-  switch (tensor->strategy) {
-  case strategy::array:
-    threaded_n_mode_product_array(matrix, vector, tensor);
-    break;
-  default:
-    die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n",
-	strategy_to_string(tensor->strategy));
-    break;
-  }
-}
diff --git a/src/tensor.h b/src/tensor.h
index 378eb54..57af6d8 100644
--- a/src/tensor.h
+++ b/src/tensor.h
@@ -7,14 +7,6 @@
 #include "vector.h"
 #include <stdio.h>
 
-namespace permutation_heuristic {
-  typedef enum {
-    none,
-    naive_minimum,
-    naive_median
-  } type_t;
-}
-
 namespace file_format {
   typedef enum {
     unknown,
@@ -49,11 +41,10 @@ namespace orientation {
 }
 
 typedef struct {
-  uint                l, m, n, nnz;
+  uint                l, m, n;
   strategy::type_t    strategy;
   orientation::type_t orientation;
   ownership::type_t   owner;
-  void                *storage;
   double              *values;
 } tensor_t;
 
@@ -62,53 +53,8 @@ typedef struct {
   uint index;
 } coordinate_tuple_t;
 
-typedef int  (*index_compare_t)(const void *a, const void *b);
-typedef uint (*index_encoder_t)(coordinate_tuple_t const *tuple);
-typedef void (*index_copy_t)(void *destination, void const *source, uint i);
-
-typedef struct {
-  index_compare_t index_compare;
-  index_encoder_t index_r_encoder;
-  index_encoder_t index_c_encoder;
-  index_copy_t    index_copy;
-} conversion_callbacks_t;
-
-typedef struct {
-  conversion_callbacks_t *callbacks;
-} tensor_storage_base_t;
-
-typedef struct {
-  tensor_storage_base_t dummy;
-  coordinate_tuple_t    *tuples;
-} tensor_storage_coordinate_t;
-
-typedef struct {
-  tensor_storage_base_t base;
-  uint                  n, rn, cn, tn, kn;
-  uint                  *RO, *CO, *TO, *KO;
-} tensor_storage_compressed_t;
-
-typedef struct {
-  tensor_storage_base_t base;
-  uint                  rn, ckn;
-  uint                  *RO, *CK;
-} tensor_storage_extended_t;
-
-typedef struct {
-  tensor_storage_extended_t dummy;
-} tensor_storage_ekmr_t;
-
-typedef struct {
-  tensor_storage_extended_t dummy;
-} tensor_storage_zzekmr_t;
-
-#define STORAGE_BASE(x) ((tensor_storage_base_t*)x->storage)
-#define STORAGE_COORIDINATE(x) ((tensor_storage_coordinate_t*)x->storage)
-#define STORAGE_COMPRESSED(x) ((tensor_storage_compressed_t*)x->storage)
-#define STORAGE_EXTENDED(x) ((tensor_storage_extended_t*)x->storage)
-
 tensor_t* tensor_malloc(uint l, uint m, uint n, ownership::type_t owner = ownership::creator);
-tensor_t* tensor_malloc(uint l, uint m, uint n, uint nnz, strategy::type_t strategy, 
+tensor_t* tensor_malloc(uint l, uint m, uint n, strategy::type_t strategy, 
 			orientation::type_t orientation = orientation::unknown,
 			ownership::type_t owner = ownership::creator);
 tensor_t* tensor_malloc_from_template(tensor_t const *tensor);
@@ -121,10 +67,10 @@ void tensor_transfer_ownership(tensor_t *destination, tensor_t *source);
 
 void tensor_clear(tensor_t *tensor);
 
+#if 0
 tensor_t *tensor_convert(tensor_t *tensor, strategy::type_t strategy, orientation::type_t orientation = orientation::unknown);
 void tensor_convert(tensor_t *destination, tensor_t *source);
-
-tensor_t* tensor_permute(tensor_t *tensor, permutation_heuristic::type_t heuristic);
+#endif
 
 tensor_t *tensor_read(char const *filename);
 tensor_t *tensor_fread(FILE *file);
@@ -139,18 +85,14 @@ void tensor_validate(tensor_t const *tensor);
 char const* strategy_to_string(strategy::type_t strategy);
 char const* orientation_to_string(orientation::type_t orientation);
 char const* orientation_to_latex_macro(orientation::type_t orientation);
-char const* permutation_heuristic_to_string(permutation_heuristic::type_t heuristic);
 strategy::type_t string_to_strategy(char const *name);
 orientation::type_t string_to_orientation(char const *name);
-permutation_heuristic::type_t string_to_permutation_heuristic(char const *name);
 strategy::type_t typecode_to_strategy(MM_typecode type);
 void strategy_to_typecode(MM_typecode *type, strategy::type_t strategy);
 void print_strategies(char const *format);
 void print_orientations(char const *format);
 void print_operations(char const *format);
 void print_operations_with_descriptions(char const *format);
-void print_permutation_heuristics(char const *format);
-void print_permutation_heuristics_with_descriptions(char const *format);
 
 #if 0
 void tensor_add(tensor_t *t1, tensor_t const *t2);
diff --git a/src/tensor_clear.cc b/src/tensor_clear.cc
index fc23665..e76d23b 100644
--- a/src/tensor_clear.cc
+++ b/src/tensor_clear.cc
@@ -2,90 +2,18 @@
 #include "error.h"
 #include "tensor.h"
 
-void
-tensor_storage_clear_coordinate(tensor_t *tensor)
-{
-  uint i;
-  tensor_storage_coordinate_t *storage;
-  coordinate_tuple_t   *tuples;
-  
-  debug("tensor_storage_clear_coordinate(0x%x)\n", tensor);
-  
-  storage = STORAGE_COORIDINATE(tensor);
-  tuples  = storage->tuples;
-  
-  for (i = 0; i < tensor->nnz; ++i) {
-    tuples[i].i     = 0;
-    tuples[i].j     = 0;
-    tuples[i].k     = 0;
-    tuples[i].index = 0;
-  }
-}
-
-void
-tensor_storage_clear_compressed(tensor_t *tensor)
-{
-  uint i;
-  tensor_storage_compressed_t *storage;
-  
-  debug("tensor_storage_clear_compressed(0x%x)\n", tensor);
-  
-  storage = STORAGE_COMPRESSED(tensor);
-  
-  for (i = 0; i < storage->rn; ++i) {
-    storage->RO[i] = 0;
-  }
-  
-  for (i = 0; i < tensor->nnz; ++i) {
-    storage->CO[i] = 0;
-    storage->KO[i] = 0;
-  }
-}
-
-void
-tensor_storage_clear_extended(tensor_t *tensor)
-{
-  uint i;
-  tensor_storage_extended_t *storage;
-  
-  debug("tensor_storage_clear_ekmr(0x%x)\n", tensor);
-  
-  storage = STORAGE_EXTENDED(tensor);
-  
-  for (i = 0; i < storage->rn; ++i) {
-    storage->RO[i] = 0;
-  }
-  
-  for (i = 0; i < tensor->nnz; ++i) {
-    storage->CK[i] = 0;
-  }
-}
-
 void
 tensor_clear(tensor_t *tensor)
 {
-  uint i;
+  uint i, n;
   
   debug("tensor_clear(0x%x)\n", tensor);
   tensor_validate(tensor);
   
-  for (i = 0; i < tensor->nnz; ++i) {
-    tensor->values[i] = 0.0;
-  }
+  n = tensor->l*tensor->m*tensor->n;
   
-  switch (tensor->strategy) {
-  case strategy::coordinate:
-    tensor_storage_clear_coordinate(tensor);
-    break;
-  case strategy::compressed:
-    tensor_storage_clear_compressed(tensor);
-    break;
-  case strategy::ekmr:
-  case strategy::zzekmr:
-    tensor_storage_clear_extended(tensor);
-    break;
-  default:
-    die("Tensor storage strategy '%d' is not supported.\n", tensor->strategy);
+  for (i = 0; i < n; ++i) {
+    tensor->values[i] = 0.0;
   }
 }
 
diff --git a/src/tensor_copy.cc b/src/tensor_copy.cc
index 267ec71..1d2c9eb 100644
--- a/src/tensor_copy.cc
+++ b/src/tensor_copy.cc
@@ -9,7 +9,6 @@ tensor_copy_shallow(tensor_t *destination, tensor_t *source)
   
   destination->owner   = ownership::viewer;
   destination->values  = source->values;
-  destination->storage = source->storage;
 }
 
 tensor_t*
@@ -19,7 +18,7 @@ tensor_copy_shallow(tensor_t *source)
   
   debug("tensor_copy_shallow(source=0x%x)\n", source);
   
-  destination = tensor_malloc(source->l, source->m, source->n, source->nnz, source->strategy, source->orientation, source->owner);
+  destination = tensor_malloc(source->l, source->m, source->n, source->strategy, source->orientation, source->owner);
   tensor_copy_shallow(destination, source);
   
   return destination;
diff --git a/src/tensor_emit_latex.cc b/src/tensor_emit_latex.cc
index d3ae185..3150066 100644
--- a/src/tensor_emit_latex.cc
+++ b/src/tensor_emit_latex.cc
@@ -46,47 +46,7 @@ print_footer(FILE *file)
   fprintf(file, "\\end{tabular}\n");
 }
 
-void
-tensor_fwrite_compressed_latex(FILE *file, tensor_t const *tensor)
-{
-  uint                        l, m, n;
-  int                         nnz;
-  tensor_storage_compressed_t *storage;
-  char const                  *name, *macro;
-  
-  debug("tensor_fwrite_compressed_latex(file=0x%x, tensor=0x%x)\n", file, tensor);
-  
-  storage = STORAGE_COMPRESSED(tensor);
-  l       = tensor->l;
-  m       = tensor->m;
-  n       = tensor->n;
-  nnz     = tensor->nnz;
-  name    = orientation_to_string(tensor->orientation);
-  macro   = orientation_to_latex_macro(tensor->orientation);
-  
-  debug("tensor_fwrite_compressed_latex: l=%d, m=%d, n=%d, nnz=%d, orientation='%s', macro='%s'.\n", 
-	l, m, n, nnz, name, macro);
-  
-  print_header(file, nnz);
-  print_hline(file, storage->rn);
-  fprintf(file, "$\\row_{\\%s}$ & ", macro);
-  for_each_fprintf(file, "%d%s", storage->RO, storage->rn, " & ", " \\\\\n");
-  print_hline(file, storage->cn);
-  fprintf(file, "$\\col_{\\%s}$ & ", macro);
-  for_each_fprintf(file, "%d%s", storage->CO, storage->cn,  " & ", " \\\\\n");  
-  print_hline(file, storage->tn);
-  fprintf(file, "$\\tube_{\\%s}$ & ", macro);
-  for_each_fprintf(file, "%d%s", storage->TO, storage->tn,  " & ", " \\\\\n");  
-  print_hline(file, storage->kn);
-  fprintf(file, "$KO_{\\%s}$ & ", macro);
-  for_each_fprintf(file, "%d%s", storage->KO, storage->kn,  " & ", " \\\\\n");
-  print_hline(file, nnz);
-  fprintf(file, "$\\val_{\\%s}$ & ", macro);
-  for_each_fprintf(file, "%g%s", tensor->values, nnz,  " & ", " \\\\\n");
-  print_hline(file, nnz);
-  print_footer(file);
-}
-
+#if 0
 void
 tensor_fwrite_extended_compressed_latex(FILE *file, tensor_t const *tensor, strategy::type_t strategy)
 {
@@ -121,25 +81,12 @@ tensor_fwrite_extended_compressed_latex(FILE *file, tensor_t const *tensor, stra
   print_hline(file, nnz);
   print_footer(file);
 }
+#endif
 
 void
 tensor_emit_latex(FILE *file, tensor_t const *tensor)
 {
   debug("tensor_emit_latex(file=0x%x, tensor=0x%x)\n", file, tensor);
   debug("tensor_emit_latex: strategy='%s'\n", strategy_to_string(tensor->strategy));
-  
-  switch (tensor->strategy) {
-  case strategy::compressed:
-  case strategy::slice:
-    tensor_fwrite_compressed_latex(file, tensor);
-    break;
-  case strategy::ekmr:
-  case strategy::zzekmr:
-    tensor_fwrite_extended_compressed_latex(file, tensor, tensor->strategy);
-    break;
-  default:
-    die("Emitting LaTeX source for storage strategy '%d' is not supported.\n", 
-	strategy_to_string(tensor->strategy));
-  }
 }
 
diff --git a/src/tensor_free.cc b/src/tensor_free.cc
index 58cdd70..696a2ad 100644
--- a/src/tensor_free.cc
+++ b/src/tensor_free.cc
@@ -6,73 +6,6 @@
 #include <stdio.h>
 #include <assert.h>
 
-void
-tensor_storage_free(tensor_storage_base_t *storage)
-{
-  superfluous("tensor_storage_free((tensor_storage_base_t*)0x%x)\n", storage);
-  
-  safe_free(storage->callbacks);
-}
-
-void
-tensor_storage_free(tensor_storage_coordinate_t *storage)
-{
-  superfluous("tensor_storage_free((tensor_storage_coordinate_t*)0x%x)\n", storage);
-  
-  safe_free(storage->tuples);
-}
-
-void
-tensor_storage_free(tensor_storage_compressed_t *storage)
-{
-  superfluous("tensor_storage_free((tensor_storage_compressed_t*)0x%x)\n", storage);
-  
-  safe_free(storage->RO);
-  safe_free(storage->CO);
-  safe_free(storage->TO);
-  safe_free(storage->KO);
-}
-
-void
-tensor_storage_free(tensor_storage_extended_t *storage)
-{
-  superfluous("tensor_storage_free((tensor_storage_extended_t*)0x%x)\n", storage);
-  
-  safe_free(storage->RO);
-  safe_free(storage->CK);
-}
-
-void
-tensor_storage_free(tensor_t *tensor)
-{
-  superfluous("tensor_storage_free(0x%x)\n", tensor);
-  
-  if (!tensor->storage) {
-    return;
-  }
-  
-  tensor_storage_free(STORAGE_BASE(tensor));
-  
-  switch (tensor->strategy) {
-  case strategy::coordinate:
-    tensor_storage_free(STORAGE_COORIDINATE(tensor));
-    break;
-  case strategy::compressed:
-  case strategy::slice:
-    tensor_storage_free(STORAGE_COMPRESSED(tensor));
-    break;
-  case strategy::ekmr:
-  case strategy::zzekmr:
-    tensor_storage_free(STORAGE_EXTENDED(tensor));
-    break;
-  default:
-    die("Tensor storage strategy '%d' is not supported.\n", 
-	strategy_to_string(tensor->strategy));
-  }
-  
-  safe_free(tensor->storage);
-}
-
 void
 tensor_free(tensor_t *tensor)
 {
@@ -84,7 +17,6 @@ tensor_free(tensor_t *tensor)
   
   if (ownership::creator == tensor->owner) {
     safe_free(tensor->values);
-    tensor_storage_free(tensor);
   }
   
   safe_free(tensor);
diff --git a/src/tensor_malloc.cc b/src/tensor_malloc.cc
index ddce7ce..42f5be1 100644
--- a/src/tensor_malloc.cc
+++ b/src/tensor_malloc.cc
@@ -2,7 +2,6 @@
 #include "error.h"
 #include "memory.h"
 #include "mmio.h"
-#include "storage.h"
 #include "tensor.h"
 #include "utility.h"
 #include <stdio.h>
@@ -24,12 +23,10 @@ tensor_malloc(uint l, uint m, uint n, ownership::type_t owner)
   tensor->l           = l;
   tensor->m           = m;
   tensor->n           = n;
-  tensor->nnz         = 0;
   tensor->strategy    = strategy::array;
   tensor->orientation = orientation::unknown;
   tensor->owner       = owner;
   tensor->values      = NULL;
-  tensor->storage     = NULL;
   
   if (ownership::viewer == owner) {
     return tensor;
@@ -44,35 +41,29 @@ tensor_malloc(uint l, uint m, uint n, ownership::type_t owner)
 }
 
 tensor_t*
-tensor_malloc(uint l, uint m, uint n, uint nnz, strategy::type_t strategy, orientation::type_t orientation, ownership::type_t owner)
+tensor_malloc(uint l, uint m, uint n, strategy::type_t strategy, orientation::type_t orientation, ownership::type_t owner)
 {
   tensor_t *tensor;
   
-  superfluous("tensor_malloc(l=%d, m=%d, n=%d, nnz=%d, strategy='%s', orientation='%s')\n",
-	l, m, n, nnz, strategy_to_string(strategy), orientation_to_string(orientation));
+  superfluous("tensor_malloc(l=%d, m=%d, n=%d, strategy='%s', orientation='%s')\n",
+	l, m, n, strategy_to_string(strategy), orientation_to_string(orientation));
   
   tensor              = MALLOC(tensor_t);
   tensor->l           = l;
   tensor->m           = m;
   tensor->n           = n;
-  tensor->nnz         = nnz;
   tensor->strategy    = strategy;
   tensor->orientation = orientation;
   tensor->owner       = owner;
   tensor->values      = NULL;
-  tensor->storage     = NULL;
   
   if (ownership::viewer == owner) {
     return tensor;
   }
   
-  if (nnz > 0) {
-    tensor->values  = MALLOC_N(double, nnz);
-    tensor->storage = tensor_storage_malloc(tensor);
-  }
+  tensor->values      = MALLOC_N(double, l*m*n);
   
   superfluous("tensor_malloc: tensor->values=0x%x\n", tensor->values);
-  superfluous("tensor_malloc: tensor->storage=0x%x\n", tensor->storage);
   superfluous("tensor_malloc: tensor=0x%x\n", tensor);
 
   return tensor;
@@ -83,5 +74,5 @@ tensor_malloc_from_template(tensor_t const *tensor)
 {
   superfluous("tensor_malloc_from_template(tensor=0x%x)\n", tensor);
   
-  return tensor_malloc(tensor->l, tensor->m, tensor->n, tensor->nnz, tensor->strategy, tensor->orientation, tensor->owner);
+  return tensor_malloc(tensor->l, tensor->m, tensor->n, tensor->strategy, tensor->orientation, tensor->owner);
 }
diff --git a/src/tensor_read.cc b/src/tensor_read.cc
index 0fcb404..775b946 100644
--- a/src/tensor_read.cc
+++ b/src/tensor_read.cc
@@ -43,6 +43,7 @@ tensor_fread_array(FILE *file)
   return tensor;
 }
 
+#if 0
 tensor_t*
 tensor_fread_coordinate(FILE *file)
 {
@@ -213,6 +214,7 @@ tensor_fread_extended_compressed(FILE *file, strategy::type_t strategy)
   
   return tensor;
 }
+#endif
 
 tensor_t*
 tensor_fread_mmio_data(FILE *file, MM_typecode type)
@@ -237,6 +239,7 @@ tensor_fread_mmio_data(FILE *file, MM_typecode type)
   case strategy::array:
     tensor = tensor_fread_array(file);
     break;
+#if 0
   case strategy::coordinate:
     tensor = tensor_fread_coordinate(file);
     break;
@@ -250,6 +253,7 @@ tensor_fread_mmio_data(FILE *file, MM_typecode type)
   case strategy::zzekmr:
     tensor = tensor_fread_extended_compressed(file, strategy);
     break;
+#endif
   default:
     die("Tensor storage strategy '%d' is not supported.\n", strategy);
   }
@@ -270,6 +274,7 @@ tensor_fread_mmio(FILE *file)
   return tensor_fread_mmio_data(file, type);
 }
 
+#if 0
 tensor_t*
 tensor_fread_matlab(FILE *file)
 {
@@ -322,6 +327,7 @@ tensor_fread_matlab(FILE *file)
   
   return tensor;
 }
+#endif
 
 file_format::type_t
 detect_file_format(FILE *file)
@@ -357,9 +363,11 @@ tensor_fread_file_format(FILE *file, file_format::type_t format)
   case file_format::mmio:
     tensor = tensor_fread_mmio(file);
     break;
+#if 0
   case file_format::matlab:
     tensor = tensor_fread_matlab(file);
     break;
+#endif
   default:
     die("tensor_fread_file_format: unknown file type %d.\n", format);
     break;
diff --git a/src/tensor_utility.cc b/src/tensor_utility.cc
index 9700df2..dbd1ff1 100644
--- a/src/tensor_utility.cc
+++ b/src/tensor_utility.cc
@@ -4,59 +4,6 @@
 #include "utility.h"
 #include <string.h>
 
-static char const *map_permutation_heuristics_to_string[] = {
-  "none",
-  "naive-minimum",
-  "naive-median"
-};
-
-static char const *map_permutation_heuristics_to_description[] = { 
-  "none",
-  "re-order tensor layout based on minimum intra-slice proximity",
-  "re-order tensor layout based on median intra-slice proximity"
-};
-
-char const*
-permutation_heuristic_to_string(permutation_heuristic::type_t heuristic)
-{
-  return map_permutation_heuristics_to_string[heuristic];
-}
-
-permutation_heuristic::type_t
-string_to_permutation_heuristic(char const *name)
-{
-  uint i;
-  
-  for (i = 0; i < COUNT_OF(map_permutation_heuristics_to_string); ++i) {
-    if (0 == strcmp(name, map_permutation_heuristics_to_string[i])) {
-      return (permutation_heuristic::type_t) i;
-    }
-  }
-  
-  return permutation_heuristic::none;
-}
-
-void
-print_permutation_heuristics(char const *format)
-{
-  uint i;
-  
-  for (i = 1; i < COUNT_OF(map_permutation_heuristics_to_string); ++i) {
-    message(format, map_permutation_heuristics_to_string[i]);
-  }
-}
-
-void
-print_permutation_heuristics_with_descriptions(char const *format)
-{
-  uint i;
-  
-  for (i = 1; i < COUNT_OF(map_permutation_heuristics_to_string); ++i) {
-    message(format, map_permutation_heuristics_to_string[i], 
-	    map_permutation_heuristics_to_description[i]);
-  }
-}
-
 static char const *map_strategy_to_string[] = {
   "unknown",
   "array",
diff --git a/src/tensor_validate.cc b/src/tensor_validate.cc
index 32808ed..96f47ea 100644
--- a/src/tensor_validate.cc
+++ b/src/tensor_validate.cc
@@ -12,8 +12,4 @@ tensor_validate(tensor_t const *tensor)
   if (!tensor->values) {
     die("Tensor values have not been allocated.\n");
   }
-  
-  if (!tensor->storage) {
-    die("Tensor indexing strategy has not been allocated.\n");
-  }
 }
diff --git a/src/tensor_write.cc b/src/tensor_write.cc
index 8f69149..67700e1 100644
--- a/src/tensor_write.cc
+++ b/src/tensor_write.cc
@@ -51,6 +51,7 @@ tensor_fwrite_array(FILE *file, tensor_t const *tensor)
   }
 }
 
+#if 0
 void
 tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor)
 {
@@ -230,6 +231,7 @@ tensor_fwrite_extended_compressed(FILE *file, tensor_t const *tensor)
     fprintf(file, "%d %10.6g\n", storage->CK[i], tensor->values[i]);
   }
 }
+#endif
 
 void
 tensor_fwrite_implementation(FILE *file, tensor_t const *tensor)
@@ -241,6 +243,7 @@ tensor_fwrite_implementation(FILE *file, tensor_t const *tensor)
   case strategy::array:
     tensor_fwrite_array(file, tensor);
     break;
+#if 0
   case strategy::coordinate:
     tensor_fwrite_coordinate(file, tensor);
     break;
@@ -254,6 +257,7 @@ tensor_fwrite_implementation(FILE *file, tensor_t const *tensor)
   case strategy::zzekmr:
     tensor_fwrite_extended_compressed(file, tensor);
     break;
+#endif
   default:
     die("Tensor storage strategy '%d' is not supported.\n", 
 	strategy_to_string(tensor->strategy));
diff --git a/src/tool.h b/src/tool.h
index f687070..a40c1cf 100644
--- a/src/tool.h
+++ b/src/tool.h
@@ -12,10 +12,8 @@ namespace tool {
   typedef enum {
     unknown,
     tensor,
-    convert,
     generate,
-    effectuate,
-    permute
+    effectuate
   } type_t;
 }
 
@@ -41,10 +39,8 @@ namespace tool {
 #define DEFAULT_CACHE_SIZE            (2*1024)
 #define DEFAULT_CACHE_LINE_SIZE       32
 
-void convert_tool_main(int argc, char *argv[]);
 void generate_tool_main(int argc, char *argv[]);
 void effectuate_tool_main(int argc, char *argv[]);
-void permute_tool_main(int argc, char *argv[]);
 
 vector_t* timed_vector_read(char const *name);
 matrix_t* timed_matrix_read(char const *name);

From 1b33c664052d319317af0f62f119ca81f660ea4b Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Wed, 9 Nov 2011 13:55:23 -0700
Subject: [PATCH 27/57] + Removed sparse tensor storage code

---
 src/format.h                           | 164 ----------
 src/storage.h                          |  42 ---
 src/tensor_convert.cc                  |  33 --
 src/tensor_permute.cc                  | 403 -------------------------
 src/tensor_storage_compressed.cc       | 123 --------
 src/tensor_storage_compressed_slice.cc | 122 --------
 src/tensor_storage_convert.cc          | 230 --------------
 src/tensor_storage_coordinate.cc       | 134 --------
 src/tensor_storage_ekmr.cc             | 100 ------
 src/tensor_storage_gundersen.cc        | 136 ---------
 src/tensor_storage_malloc.cc           |  44 ---
 src/tensor_storage_matrix_slice.cc     | 117 -------
 src/tensor_storage_utility.cc          | 198 ------------
 src/tensor_storage_zzekmr.cc           | 140 ---------
 src/tool_convert.cc                    | 169 -----------
 src/tool_permute.cc                    | 171 -----------
 16 files changed, 2326 deletions(-)
 delete mode 100644 src/format.h
 delete mode 100644 src/storage.h
 delete mode 100644 src/tensor_convert.cc
 delete mode 100644 src/tensor_permute.cc
 delete mode 100644 src/tensor_storage_compressed.cc
 delete mode 100644 src/tensor_storage_compressed_slice.cc
 delete mode 100644 src/tensor_storage_convert.cc
 delete mode 100644 src/tensor_storage_coordinate.cc
 delete mode 100644 src/tensor_storage_ekmr.cc
 delete mode 100644 src/tensor_storage_gundersen.cc
 delete mode 100644 src/tensor_storage_malloc.cc
 delete mode 100644 src/tensor_storage_matrix_slice.cc
 delete mode 100644 src/tensor_storage_utility.cc
 delete mode 100644 src/tensor_storage_zzekmr.cc
 delete mode 100644 src/tool_convert.cc
 delete mode 100644 src/tool_permute.cc

diff --git a/src/format.h b/src/format.h
deleted file mode 100644
index f499af8..0000000
--- a/src/format.h
+++ /dev/null
@@ -1,164 +0,0 @@
-
-#pragma once
-#ifndef _FORMAT_H_
-#define _FORMAT_H_
-
-/*--------------------------------------------------------------------*/
-
-#include "storage.h"
-#include "tensor.h"
-#include "utility.h"
-
-#include <algorithm>
-#include <iomanip>
-#include <iostream>
-#include <vector>
-
-/*--------------------------------------------------------------------*/
-
-BEGIN_NAMESPACE(storage);
-
-/*--------------------------------------------------------------------*/
-
-template<class T>
-class tensor;
-
-/*--------------------------------------------------------------------*/
-
-END_NAMESPACE(storage);
-
-/*--------------------------------------------------------------------*/
-
-BEGIN_NAMESPACE(format);
-
-/*--------------------------------------------------------------------*/
-
-const int precision = 32;
-
-/*--------------------------------------------------------------------*/
-
-BEGIN_NAMESPACE(strategy);
-
-typedef enum {
-  coordinate = 0,
-  max        = coordinate
-} type;
-
-END_NAMESPACE(strategy);
-
-/*--------------------------------------------------------------------*/
-
-template <class T>
-class coordinate {
-
-public:
-
-  std::istream&
-  read(std::istream& in, strategy::tensor<T>& data)  {
-    
-    T   v;
-    int i, j, k;
-    int n, nnz;
-    
-    /* determine the data's dimensionality */
-    in >> n >> n >> n >> nnz;
-    
-    /* make sure we will not have to realloc during the read */
-    data.initialize(n, nnz);
-    
-    /* read the data */
-    while (nnz--) {
-      in >> k >> i >> j >> v;
-      data.set(k, j, i, v);
-    }
-    
-    /* all done */
-    return in;
-    
-  }
-
-  std::ostream&
-  write(std::ostream& out, strategy::tensor<T> const &data) const {
-   
-    T   v;
-    int n, nnz;
-    
-    /* make sure we have some data */
-    if (data.empty()) {
-      std::cerr << "ERROR: no data to print!" << std::endl;
-      return out;
-    }
-
-    /* determine the data's dementionality */
-    n = data.size();
-
-    /* determine the number of non-zero entries */
-    nnz = 0;
-    for (int k = 0; k < n; ++k) {
-      for (int i = 0; i < n; ++i) {
-	for (int j = 0; j < n; ++j) {
-	  if (0.0 != data.get(k, i, j)) {
-	    nnz++;
-	  }
-	}
-      }
-    }
-    
-    /* print the dimensions so we can optionally use the output as
-       input later */
-    out << n << " " << n << " " << n << " " 
-	<< nnz << std::endl;
-    
-    /* set the output format */
-    out << std::setprecision(precision)
-	<< std::scientific;
-    
-    /* print the data */
-    for (int k = 0; k < n; ++k) {
-      for (int i = 0; i < n; ++i) {
-	for (int j = 0; j < n; ++j) {
-	  if (0.0 != (v = data.get(k, i, j))) {
-	    out << k << " " << i << " " << j << " " 
-		<< v << std::endl;
-	  }
-	}
-      }
-    }
-    
-    /* all done */
-    return out;
-    
-  }
-  
-};
-
-/*--------------------------------------------------------------------*/
-
-/* Compensate for the lack of templated typedefs.
-   Usage: 
-   sextended_karnaugh_map<float, acronyms::CSR> M;
-   ** NOT WORKING because of template template parameters **
-*/
-#if 0
-template<class T>
-struct acronyms {
-  typedef coordinate<T> COO;
-  typedef compressed_sparse_row<T> CSR;
-  typedef compressed_sparse_column<T> CSC;
-  typedef block_sparse_row<T> BSR;
-};
-#endif
-
-/*--------------------------------------------------------------------*/
-
-END_NAMESPACE(format);
-
-/*--------------------------------------------------------------------*/
-
-#endif /* _FORMAT_H_ */
-
-/*
-  Local Variables:
-  mode: C++
-  End:
-*/
diff --git a/src/storage.h b/src/storage.h
deleted file mode 100644
index c402482..0000000
--- a/src/storage.h
+++ /dev/null
@@ -1,42 +0,0 @@
-
-#ifndef _STORAGE_H_
-#define _STORAGE_H_
-
-#include "tensor.h"
-
-void* tensor_storage_malloc(tensor_t const *tensor);
-tensor_storage_coordinate_t* tensor_storage_malloc_coordinate(tensor_t const *tensor);
-tensor_storage_compressed_t* tensor_storage_malloc_compressed(tensor_t const *tensor);
-tensor_storage_compressed_t* tensor_storage_malloc_compressed_slice(tensor_t const *tensor);
-tensor_storage_extended_t* tensor_storage_malloc_ekmr(tensor_t const *tensor);
-tensor_storage_extended_t* tensor_storage_malloc_zzekmr(tensor_t const *tensor);
-
-void tensor_storage_convert(tensor_t *destination, tensor_t *source);
-void tensor_storage_convert_from_coordinate_to_compressed(tensor_t *destination, tensor_t *source);
-void tensor_storage_convert_from_coordinate_to_compressed_slice(tensor_t *destination, tensor_t *source);
-void tensor_storage_convert_from_coordinate_to_gundersen(tensor_t *destination, tensor_t *source);
-void tensor_storage_convert_from_coordinate_to_ekmr(tensor_t *destination, tensor_t *source);
-void tensor_storage_convert_from_coordinate_to_zzekmr(tensor_t *destination, tensor_t *source);
-void tensor_storage_convert_from_compressed_to_coordinate(tensor_t *destination, tensor_t *source);
-
-int index_compare_ijk(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb);
-int index_compare_jik(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb);
-int index_compare_jki(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb);
-int index_compare_kji(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb);
-int index_compare_kij(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb);
-int index_compare_ikj(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb);
-
-uint tensor_storage_index_encode(uint *indices, uint n, coordinate_tuple_t const *tuple, uint nnz, index_encoder_t encoder);
-uint encoder_for_i(coordinate_tuple_t const *tuple);
-uint encoder_for_j(coordinate_tuple_t const *tuple);
-uint encoder_for_k(coordinate_tuple_t const *tuple);
-
-void tensor_storage_copy(void *destination, void const *source, uint nnz, index_copy_t copier);
-void copier_for_i(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i);
-void copier_for_j(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i);
-void copier_for_k(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i);
-void copier_for_values(tensor_t *destination, tensor_t const *source, uint i);
-
-
-#endif
-
diff --git a/src/tensor_convert.cc b/src/tensor_convert.cc
deleted file mode 100644
index bf7294d..0000000
--- a/src/tensor_convert.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-
-#include "compatible.h"
-#include "error.h"
-#include "memory.h"
-#include "mmio.h"
-#include "storage.h"
-#include "tensor.h"
-#include "utility.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-void
-tensor_convert(tensor_t *destination, tensor_t *source)
-{
-  debug("tensor_convert(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  compatible(destination, source);
-  tensor_storage_convert(destination, source);
-}
-
-tensor_t*
-tensor_convert(tensor_t *tensor, strategy::type_t strategy, orientation::type_t orientation)
-{
-  tensor_t *result;
-  
-  debug("tensor_convert(tensor=0x%x, strategy='%s', orientation='%s')\n", 
-	tensor, strategy_to_string(strategy), orientation_to_string(orientation));
-  
-  result = tensor_malloc(tensor->l, tensor->m, tensor->n, tensor->nnz, strategy, orientation);
-  tensor_convert(result, tensor);
-  
-  return result;
-}
diff --git a/src/tensor_permute.cc b/src/tensor_permute.cc
deleted file mode 100644
index c41370b..0000000
--- a/src/tensor_permute.cc
+++ /dev/null
@@ -1,403 +0,0 @@
-
-#include "error.h"
-#include "memory.h"
-#include "matrix.h"
-#include "tensor.h"
-#include "utility.h"
-#include "vector.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-
-typedef uint (*slice_distance_t)(tensor_t *tensor, uint i, uint j);
-typedef void (*slice_permutation_t)(vector_t *vector, tensor_t *tensor, slice_distance_t distance);
-
-uint
-slice_distance(tensor_t *tensor, uint s1, uint s2)
-{
-  uint                              i, j, c1, c2, k1, k2, n;
-  uint                              distance;
-  uint const                        *R, *C, *K;
-  tensor_storage_compressed_t const *storage;
-  
-  //superfluous("slice_distance(vector=0x%x, s1=%d, s2=%d)\n", tensor, s1, s2);
-  
-  distance = 0;
-  storage  = STORAGE_COMPRESSED(tensor);
-  n        = storage->rn;
-  R        = storage->RO;
-  C        = storage->CO;
-  K        = storage->KO;
-  
-  for (i = R[s1], j = R[s2]; i < R[s1+1] && j < R[s2+1];) {
-    c1 = K[i] / n; k1 = K[i] % n;
-    c2 = K[j] / n; k2 = K[j] % n;
-    if (c1 != c2 || k1 != k2) {
-      distance++;
-      if (c1 < c2 || k1 < k2) {
-	i++;
-      } else if (c2 < c1 || k2 < k1) {
-	j++;
-      }
-    } else {
-      i++; j++;
-    }
-  }
-  for (; i < R[s1+1]; ++i) {
-    distance++;
-  }
-  for (; j < R[s2+1]; ++j) {
-    distance++;
-  }
-  
-  //DEBUG("slice_distance: distance(%d, %d)=%d\n", s1, s2, distance);
-  
-  return distance;
-}
-
-
-void
-naive_minimum_permutation(vector_t *vector, tensor_t *tensor, slice_distance_t distance)
-{
-  uint     i, j, k, p;
-  uint     n;
-  uint     best;
-  matrix_t *matrix;
-  double   **D;
-  uint     *V;
-  bool     *seen;
-  uint const                        *R, *C, *K;
-  tensor_storage_compressed_t const *storage;
-  
-  debug("naive_minimum_permutation(vector=0x%x, tensor=0x%x, distance=0x%x)\n", 
-	vector, tensor, distance);
-  
-  storage  = STORAGE_COMPRESSED(tensor);
-  R        = storage->RO;
-  C        = storage->CO;
-  K        = storage->KO;
-  
-  n        = tensor->n;
-  matrix   = matrix_malloc(n, n);
-  D        = matrix->data;
-  V        = vector->data;
-    
-  matrix_clear(matrix);
-  
-  for (i = 0; i < n; ++i) {
-    D[i][i] = n*n+1;
-  }  
-  for (j = 0; j < n; ++j) {
-    best = n*n+1;
-    for (i = j+1; i < n; ++i) {
-      if (i != j) {
-	D[i][j] = (*distance)(tensor, i, j);
-	D[j][i] = D[i][j];
-	if (best > D[i][j]) {
-	  best = D[i][j];
-	  V[0] = i;
-	  V[1] = j;
-	  DEBUG("permutation: best(%d, %d)=%d\n", i, j, best);
-	}
-      }
-    }
-  }
-  
-  DEBUG("permutation: best=%d, V[0]=%d, V[1]=%d\n", best, V[0], V[1]);
-  //matrix_fwrite(stdout, matrix, format::coordinate);
-  
-  seen = MALLOC_N(bool, n);
-  for (i = 0; i < n; ++i) {
-    seen[i] = false;
-  }
-  
-  seen[V[0]] = true;
-  seen[V[1]] = true;
-  
-  for (j = 2; j < n; ++j) {
-    best = n*n+1;
-    k    = 0;
-    p    = V[j-1];
-    for (i = 0; i < n; ++i) {
-      if (!seen[i] && i != p) {
-	DEBUG("permutation: looking-at(%d, %d)=%lf\n", i, j, D[i][j]);
-	if (best > D[p][i]) {
-	  best = D[p][i];
-	  k    = i;
-	  DEBUG("permutation: best(%d, %d)=%d\n", i, j, best);
-	}
-      }
-    }
-    V[j]       = k;
-    seen[V[j]] = true;
-    DEBUG("permutation: best=%d, V[%d]=%d, V[%d]=%d\n", best, j, V[j-1], j, V[j]);
-    DEBUG("permutation: seen=%d\n", k);
-  }
-  
-  safe_free(seen);
-  
-#if 0
-  vector_fwrite(stdout, vector)
-  vector_fwrite(stdout, mean);
-#endif
-}
-
-/*
- * The following code is public domain.
- * Algorithm by Torben Mogensen, implementation by N. Devillard.
- * This code in public domain.
- *
- * Source: http://ndevilla.free.fr/median/median/src/
- */
-uint
-non_destructive_median(double m[], uint n, uint skip) 
-{
-  uint i, less, greater, equal;
-  double min, max, guess, maxltguess, mingtguess;
-  min = max = m[0] ;
-  for (i=1 ; i<n ; i++) {
-    if (i != skip) {
-      if (m[i]<min) min=m[i];
-      if (m[i]>max) max=m[i];
-    }
-  }
-  while (1) {
-    guess = (min+max)/2;
-    less = 0; greater = 0; equal = 0; 
-    maxltguess = min ;
-    mingtguess = max ;
-    for (i=0; i<n; i++) {
-      if (i != skip) {
-	if (m[i]<guess) {
-	  less++;
-	  if (m[i]>maxltguess) maxltguess = m[i] ; } 
-	else if (m[i]>guess) {
-	  greater++;
-	  if (m[i]<mingtguess) mingtguess = m[i] ; } 
-	else equal++;
-      }
-    }
-    if (less <= (n+1)/2 && greater <= (n+1)/2) break ; 
-    else if (less>greater) max = maxltguess ;
-    else min = mingtguess;
-  }
-  if (less >= (n+1)/2) return maxltguess;
-  else if (less+equal >= (n+1)/2) return guess; 
-  else return mingtguess;
-}
-
-void
-naive_median_permutation(vector_t *vector, tensor_t *tensor, slice_distance_t distance)
-{
-  uint     i, j, k;
-  uint     n;
-  uint     best, difference;
-  vector_t *mean;
-  matrix_t *matrix;
-  double   **D;
-  uint     *V, *M;
-  bool     *seen;
-  
-  debug("naive_median_permutation(vector=0x%x, tensor=0x%x, distance=0x%x)\n", 
-	vector, tensor, distance);
-  
-  n      = tensor->n;
-  matrix = matrix_malloc(n, n);
-  mean   = vector_malloc(n);
-  D      = matrix->data;
-  V      = vector->data;
-  M      = mean->data;
-  
-  matrix_clear(matrix);
-  
-  for (j = 0; j < n; ++j) {
-    best = n*n+1;
-    for (i = 0; i < n; ++i) {
-      if (i != j) {
-	D[i][j] = (*distance)(tensor, i, j);
-	if (best > D[i][j]) {
-	  best = D[i][j];
-	  V[0] = i;
-	  V[1] = j;
-	  DEBUG("permutation: best(%d, %d)=%d\n", i, j, best);
-	}
-      }
-    }
-  }
-  
-  for (i = 0; i < n; ++i) {
-    M[i] = non_destructive_median(D[i], n, i);
-  }
-  
-  DEBUG("permutation: best=%d, V[0]=%d, V[1]=%d\n", best, V[0], V[1]);
-#if 0
-  matrix_fwrite(stdout, matrix, format::coordinate);
-#endif
-  
-  seen = MALLOC_N(bool, n);
-  for (i = 0; i < n; ++i) {
-    seen[i] = false;
-  }
-  
-  seen[V[0]] = true;
-  seen[V[1]] = true;
-  
-  for (j = 2; j < n; ++j) {
-    best = n*n+1;
-    k    = 0;
-    for (i = 0; i < n; ++i) {
-      if (!seen[i] && i != j) {
-	difference = fabs(D[i][j]-M[i]);
-	DEBUG("permutation: looking-at(%d, %d)=%lf (difference=%lf)\n", i, j, D[i][j], difference);
-	if (best > difference) {
-	  best = difference;
-	  k    = i;
-	  DEBUG("permutation: best(%d, %d)=%d\n", i, j, best);
-	}
-      }
-    }
-    V[j]       = k;
-    seen[V[j]] = true;
-    DEBUG("permutation: best=%d, V[%d]=%d, V[%d]=%d\n", best, j, V[j-1], j, V[j]);
-    DEBUG("permutation: seen=%d\n", k);
-  }
-  
-  safe_free(seen);
-  safe_free(matrix);
-  
-#if 0  
-  vector_fwrite(stdout, mean);
-  vector_fwrite(stdout, vector);
-#endif
-}
-
-
-
-tensor_t*
-tensor_apply_permutation(tensor_t *source, vector_t *vector)
-{
-  uint                        i, i1, i2, r0, r;
-  uint                        n, rn, nnz, offset;
-  uint const                  *R1, *K1, *V;
-  uint                        *R2, *K2;
-  tensor_storage_compressed_t *storage;
-  tensor_t                    *destination;
-  double                      *V1, *V2;
-  
-  superfluous("tensor_apply_permutation(vector=0x%x, vector=%0x%x)\n", source, vector);
-  
-  V           = vector->data;
-  
-  n           = source->n;
-  nnz         = source->nnz;
-  storage     = STORAGE_COMPRESSED(source);
-  rn          = storage->rn;
-  R1          = storage->RO;
-  K1          = storage->KO;
-  V1          = source->values;
-  
-  destination = tensor_malloc(n, n, n, nnz, strategy::slice, orientation::frontal);
-  storage     = STORAGE_COMPRESSED(destination);
-  R2          = storage->RO;
-  K2          = storage->KO;
-  V2          = destination->values;
-  storage->rn = rn;
-  
-  offset      = 0;
-  R2[0]       = 0;
-  
-  for (i = 0; i < n; ++i) {
-    r0 = R1[V[i]];
-    r  = R1[V[i]+1];
-    i2 = offset;
-    DEBUG("> r0=R1[V[i=%d]  =%d]=%d\n", i, V[i], r0);
-    DEBUG("> r =R1[V[i=%d]+1=%d]=%d\n", i, V[i]+1, r);
-    DEBUG("> i2=%d\n", i2);
-    for (i1 = r0; i1 < r && i2 < nnz; ++i1, ++i2) {
-      K2[i2] = K1[i1];
-      V2[i2] = V1[i1];
-      DEBUG("K2[i2=%d]=%d; K1[i1=%d]=%d\n",   i2, K2[i2], i1, K1[i1]);
-      DEBUG("V2[i2=%d]=%lf; V1[i1=%d]=%lf\n", i2, V2[i2], i1, V1[i1]);
-    }
-    offset  += r - r0;
-    R2[i+1]  = offset; 
-    DEBUG("< R2[i+1=%d]=%d+%d-%d=%d\n", i+1, offset, r, r0, R2[i+1]);
-  }
-  
-  R2[i] = nnz;
-  
-#if 0
-  tensor_fwrite(stdout, destination);
-#endif
-  
-  return destination;
-}
-
-
-void
-permutation_supported(tensor_t *tensor)
-{
-  debug("permutation_supported(tensor=0x%x)\n", tensor);
-  
-  if (tensor->strategy != strategy::coordinate) {
-    die("permutation_supported: the tensor strategy '%s' is not supported.\n", 
-	strategy_to_string(tensor->strategy));
-  }
-}
-
-
-tensor_t*
-tensor_permute(tensor_t *tensor, permutation_heuristic::type_t heuristic)
-{
-  vector_t            *vector;
-  tensor_t            *frontal, *permuted, *coordinate;
-  slice_distance_t    distance;
-  slice_permutation_t permutation;
-  
-  debug("tensor_permute(tensor=0x%x, heuristic='%s')\n",
-	tensor, permutation_heuristic_to_string(heuristic));
-  
-  permutation_supported(tensor);
-  
-  distance = &slice_distance;
-  vector   = vector_malloc(tensor->n);
-  frontal  = tensor_convert(tensor, strategy::slice, orientation::frontal);
-  
-#if 0
-  message("compressed frontal slice:\n");
-  tensor_fwrite(stdout, frontal);
-#endif
-  
-  switch (heuristic) {
-  case permutation_heuristic::naive_minimum:
-    permutation = &naive_minimum_permutation;
-    break;
-  case permutation_heuristic::naive_median:
-    permutation = &naive_median_permutation;
-    break;
-  default:
-    die("Heuristic '%d' is not supported.\n", heuristic);
-    break;
-  }
-  
-  (*permutation)(vector, frontal, distance);
-  permuted = tensor_apply_permutation(frontal, vector);
-  tensor_free(frontal);
-  vector_free(vector);
-  
-#if 0
-  message("compressed frontal slice (permuted):\n");
-  tensor_fwrite(stdout, permuted);
-#endif
-  
-  coordinate = tensor_convert(permuted, strategy::coordinate);
-  tensor_free(permuted);
-  
-#if 0
-  message("coordinate:\n");
-  tensor_fwrite(stdout, coordinate);
-#endif
-  
-  return coordinate;
-}
-
diff --git a/src/tensor_storage_compressed.cc b/src/tensor_storage_compressed.cc
deleted file mode 100644
index a26e1d1..0000000
--- a/src/tensor_storage_compressed.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-
-#include "error.h"
-#include "memory.h"
-#include "mmio.h"
-#include "tensor.h"
-#include "storage.h"
-#include "utility.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-void
-copier_for_row(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->CO[i] = source->tuples[i].j;
-  destination->KO[i] = source->tuples[i].k;
-}
-
-void
-copier_for_column(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->CO[i] = source->tuples[i].i;
-  destination->KO[i] = source->tuples[i].k;
-}
-
-void
-copier_for_tube(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->CO[i] = source->tuples[i].k;
-  destination->KO[i] = source->tuples[i].j;
-}
-
-void
-tensor_storage_convert_from_coordinate_to_compressed(tensor_t *destination, tensor_t *source)
-{
-  int                         n, nnz;
-  tensor_storage_base_t       *base;
-  tensor_storage_compressed_t *d;
-  tensor_storage_coordinate_t *s;
-  coordinate_tuple_t          *tuples;
-  double                      *values;
-  
-  s = STORAGE_COORIDINATE(source);
-  d = STORAGE_COMPRESSED(destination);
-  
-  debug("tensor_storage_convert_from_coordinate_to_compressed(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  base   = STORAGE_BASE(destination);
-  nnz    = source->nnz;
-  n      = source->n;
-  values = source->values;
-  tuples = s->tuples;
-  
-  qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare);
-  d->rn = tensor_storage_index_encode(d->RO, n, tuples, nnz, base->callbacks->index_r_encoder);
-  tensor_storage_copy(d, s, nnz, base->callbacks->index_copy);
-  tensor_storage_copy(destination, source, nnz, (index_copy_t) &copier_for_values);
-}
-
-tensor_storage_compressed_t*
-tensor_storage_malloc_compressed(tensor_t const *tensor)
-{
-  tensor_storage_base_t       *base;
-  tensor_storage_compressed_t *storage;
-  conversion_callbacks_t      *callbacks;
-  
-  superfluous("tensor_storage_malloc_compressed(tensor=0x%x)\n", tensor);
-  
-  storage     = MALLOC(tensor_storage_compressed_t);
-  storage->rn = 0;
-  storage->cn = tensor->nnz;
-  storage->kn = tensor->nnz;
-  storage->RO = NULL;
-  storage->CO = MALLOC_N(uint, storage->cn);
-  storage->TO = MALLOC_N(uint, storage->cn);
-  storage->KO = MALLOC_N(uint, storage->kn);
-  
-  debug("tensor_storage_malloc_compressed: rn=%d, kn=%d\n", storage->rn, storage->kn);
-  
-  callbacks                  = MALLOC(conversion_callbacks_t);
-  callbacks->index_compare   = NULL;
-  callbacks->index_r_encoder = NULL;
-  callbacks->index_copy	     = NULL;
-  
-  switch (tensor->orientation) {
-  case orientation::row:
-    storage->rn                = tensor->m;
-    callbacks->index_compare   = (index_compare_t) &index_compare_ikj;
-    callbacks->index_r_encoder = &encoder_for_i;
-    callbacks->index_copy      = (index_copy_t) &copier_for_row;
-    break;
-  case orientation::column:
-    storage->rn                = tensor->n;
-    callbacks->index_compare   = (index_compare_t) &index_compare_jki;
-    callbacks->index_r_encoder = &encoder_for_j;
-    callbacks->index_copy      = (index_copy_t) &copier_for_column;
-    break;
-  case orientation::tube:
-    storage->rn                = tensor->l;
-    callbacks->index_compare   = (index_compare_t) &index_compare_ijk;
-    callbacks->index_r_encoder = &encoder_for_i;
-    callbacks->index_copy      = (index_copy_t) &copier_for_tube;
-    break;
-  default:
-    die("tensor_storage_malloc_compressed: "
-	"unknown or unsupported orientation %d.\n", 
-	tensor->orientation);
-    break;
-  }
-  
-  storage->rn    += 1;
-  storage->RO     = MALLOC_N(uint, storage->rn);
-  base            = (tensor_storage_base_t*) storage;
-  base->callbacks = callbacks;
-  
-  superfluous("tensor_storage_malloc_compressed: callbacks=0x%x\n", callbacks);
-  superfluous("tensor_storage_malloc_compressed: storage->CO=0x%x\n", storage->CO);
-  superfluous("tensor_storage_malloc_compressed: storage->KO=0x%x\n", storage->KO);
-  superfluous("tensor_storage_malloc_compressed: storage->size (of RO)=%d\n", storage->rn);
-  superfluous("tensor_storage_malloc_compressed: storage->RO=0x%x\n", storage->RO);
-  superfluous("tensor_storage_malloc_compressed: storage=0x%x\n", storage);
-  
-  return storage;
-}
diff --git a/src/tensor_storage_compressed_slice.cc b/src/tensor_storage_compressed_slice.cc
deleted file mode 100644
index dc2e83f..0000000
--- a/src/tensor_storage_compressed_slice.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-
-#include "error.h"
-#include "memory.h"
-#include "mmio.h"
-#include "tensor.h"
-#include "storage.h"
-#include "utility.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-static uint g_n;
-
-void
-copier_for_slice_lateral(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->KO[i] = source->tuples[i].i * g_n + source->tuples[i].k;
-}
-
-void
-copier_for_slice_horizontal(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->KO[i] = source->tuples[i].j * g_n + source->tuples[i].k;
-}
-
-void
-copier_for_slice_frontal(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->KO[i] = source->tuples[i].i * g_n + source->tuples[i].j;
-  
-#if 0
-  debug("copier_for_slice_frontal: KO[i=%u]=(i=%u) * (n=%u) + (j=%u)=%u\n", 
-	i, source->tuples[i].i, g_n, source->tuples[i].j, destination->KO[i]);
-#endif
-}
-
-void
-tensor_storage_convert_from_coordinate_to_compressed_slice(tensor_t *destination, tensor_t *source)
-{
-  uint                        n, nnz;
-  tensor_storage_base_t       *base;
-  tensor_storage_compressed_t *d;
-  tensor_storage_coordinate_t *s;
-  coordinate_tuple_t          *tuples;
-  double                      *values;
-  
-  s = STORAGE_COORIDINATE(source);
-  d = STORAGE_COMPRESSED(destination);
-  
-  debug("tensor_storage_convert_from_coordinate_to_compressed_slice(destination=0x%x, source=0x%x)\n", destination, source);
-
-  base   = STORAGE_BASE(destination);
-  nnz    = source->nnz;
-  n      = source->n;
-  values = source->values;
-  tuples = s->tuples;
-  g_n    = source->n;
-  
-  qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare);
-  d->rn = tensor_storage_index_encode(d->RO, n, tuples, nnz, base->callbacks->index_r_encoder);
-  tensor_storage_copy(d, s, nnz, base->callbacks->index_copy);
-  tensor_storage_copy(destination, source, nnz, (index_copy_t) &copier_for_values);
-}
-
-tensor_storage_compressed_t*
-tensor_storage_malloc_compressed_slice(tensor_t const *tensor)
-{
-  tensor_storage_base_t       *base;
-  tensor_storage_compressed_t *storage;
-  conversion_callbacks_t      *callbacks;
-  
-  superfluous("tensor_storage_malloc_compressed_slice(tensor=0x%x)\n", tensor);
-  
-  storage     = MALLOC(tensor_storage_compressed_t);
-  storage->rn = tensor->n * tensor->n + 1;
-  storage->kn = tensor->nnz;
-  storage->RO = MALLOC_N(uint, storage->rn);
-  storage->CO = NULL;
-  storage->TO = NULL;
-  storage->KO = MALLOC_N(uint, storage->kn);
-  
-  debug("tensor_storage_malloc_compressed_slice: rn=%d, kn=%d\n", storage->rn, storage->kn);
-  
-  callbacks                  = MALLOC(conversion_callbacks_t);
-  callbacks->index_compare   = NULL;
-  callbacks->index_r_encoder = NULL;
-  callbacks->index_copy	     = NULL;
-  
-  switch (tensor->orientation) {
-  case orientation::lateral:
-    callbacks->index_compare   = (index_compare_t) &index_compare_jik;
-    callbacks->index_r_encoder = &encoder_for_j;
-    callbacks->index_copy      = (index_copy_t) &copier_for_slice_lateral;
-    break;
-  case orientation::horizontal:
-    callbacks->index_compare   = (index_compare_t) &index_compare_ijk;
-    callbacks->index_r_encoder = &encoder_for_i;
-    callbacks->index_copy      = (index_copy_t) &copier_for_slice_horizontal;
-    break;
-  case orientation::frontal:
-    callbacks->index_compare   = (index_compare_t) &index_compare_kij;
-    callbacks->index_r_encoder = &encoder_for_k;
-    callbacks->index_copy      = (index_copy_t) &copier_for_slice_frontal;
-    break;
-  default:
-    die("tensor_storage_malloc_compressed_slice: "
-	"unknown or unsupported orientation %d.\n", 
-	tensor->orientation);
-    break;
-  }
-  
-  base            = (tensor_storage_base_t*) storage;
-  base->callbacks = callbacks;
-  
-  superfluous("tensor_storage_malloc_compressed_slice: callbacks=0x%x\n", callbacks);
-  superfluous("tensor_storage_malloc_compressed_slice: storage->RO=0x%x\n", storage->RO);
-  superfluous("tensor_storage_malloc_compressed_slice: storage->CO=0x%x\n", storage->CO);
-  superfluous("tensor_storage_malloc_compressed_slice: storage->TO=0x%x\n", storage->TO);
-  superfluous("tensor_storage_malloc_compressed_slice: storage->KO=0x%x\n", storage->KO);
-  superfluous("tensor_storage_malloc_compressed_slice: storage=0x%x\n", storage);
-  
-  return storage;
-}
diff --git a/src/tensor_storage_convert.cc b/src/tensor_storage_convert.cc
deleted file mode 100644
index 91202c6..0000000
--- a/src/tensor_storage_convert.cc
+++ /dev/null
@@ -1,230 +0,0 @@
-
-#include "error.h"
-#include "memory.h"
-#include "mmio.h"
-#include "storage.h"
-#include "tensor.h"
-#include "utility.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-void
-convert_from_compressed_to_coordinate(tensor_t *destination, tensor_t *source)
-{
-  debug("convert_from_compressed_to_coordinate(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  tensor_storage_convert_from_compressed_to_coordinate(destination, source);
-  
-#if 0
-  switch (destination->orientation) {
-  case orientation::tube:
-    tensor_storage_convert_from_compressed_to_coordinate(destination, source);
-    break;
-  case orientation::row:
-  case orientation::column:
-    tensor_storage_convert_from_compressed_to_coordinate(destination, source);
-    break;
-  case orientation::lateral:
-  case orientation::horizontal:
-  case orientation::frontal:
-    tensor_storage_convert_from_compressed_slice_to_coordinate(destination, source);
-    break;
-  default:
-    die("Conversion to orientation '%s' (%d) is not currently supported.\n",
-	orientation_to_string(destination->orientation), destination->orientation);
-    break;
-  }
-#endif
-}
-
-void
-convert_to_coordinate(tensor_t *destination, tensor_t *source)
-{
-  debug("convert_to_coordinate(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (source->strategy) {
-  case strategy::compressed:
-  case strategy::slice:
-    convert_from_compressed_to_coordinate(destination, source);
-    break;
-  default:
-    die("Conversion from '%s' strategy to '%s' is not currently supported.\n",
-	strategy_to_string(source->strategy), 
-	strategy_to_string(destination->strategy));
-    break;
-  }
-}
-
-void
-convert_from_coordinate_to_compressed(tensor_t *destination, tensor_t *source)
-{
-  debug("convert_from_coordinate_to_compressed(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (destination->orientation) {
-  case orientation::row:
-  case orientation::column:
-  case orientation::tube:
-    tensor_storage_convert_from_coordinate_to_compressed(destination, source);
-    break;
-  case orientation::lateral:
-  case orientation::horizontal:
-  case orientation::frontal:
-    tensor_storage_convert_from_coordinate_to_compressed_slice(destination, source);
-    break;
-  default:
-    die("Conversion to orientation '%s' is not currently supported.\n",
-	orientation_to_string(destination->orientation));
-    break;
-  }
-}
-
-void
-convert_from_coordinate_to_gundersen(tensor_t *destination, tensor_t *source)
-{
-  debug("convert_from_coordinate_to_gundersen(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (destination->orientation) {
-  case orientation::tube:
-    tensor_storage_convert_from_coordinate_to_gundersen(destination, source);
-    break;
-  default:
-    die("Conversion to orientation '%s' is not currently supported.\n",
-	orientation_to_string(destination->orientation));
-    break;
-  }
-}
-
-void
-convert_from_coordinate_to_ekmr(tensor_t *destination, tensor_t *source)
-{
-  debug("convert_from_coordinate_to_ekmr(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (destination->orientation) {
-  case orientation::row:
-  case orientation::column:
-  case orientation::tube:
-    tensor_storage_convert_from_coordinate_to_ekmr(destination, source);
-    break;
-  default:
-    die("Conversion to orientation '%s' is not currently supported.\n",
-	orientation_to_string(destination->orientation));
-    break;
-  }
-}
-
-void
-convert_from_coordinate_to_zzekmr(tensor_t *destination, tensor_t *source)
-{
-  debug("convert_from_coordinate_to_zzekmr(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (destination->orientation) {
-  case orientation::row:
-  case orientation::column:
-  case orientation::tube:
-    tensor_storage_convert_from_coordinate_to_zzekmr(destination, source);
-    break;
-  default:
-    die("Conversion to orientation '%s' is not currently supported.\n",
-	orientation_to_string(destination->orientation));
-    break;
-  }
-}
-
-void
-convert_to_compressed(tensor_t *destination, tensor_t *source)
-{
-  debug("convert_to_compressed(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (source->strategy) {
-  case strategy::coordinate:
-    convert_from_coordinate_to_compressed(destination, source);
-    break;
-  default:
-    die("Conversion from '%s' strategy to '%s' is not currently supported.\n",
-	strategy_to_string(source->strategy), 
-	strategy_to_string(destination->strategy));
-    break;
-  }
-}
-
-void
-convert_to_gundersen(tensor_t *destination, tensor_t *source)
-{
-  debug("convert_to_gundersen(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (source->strategy) {
-  case strategy::coordinate:
-    convert_from_coordinate_to_gundersen(destination, source);
-    break;
-  default:
-    die("Conversion from '%s' strategy to '%s' is not currently supported.\n",
-	strategy_to_string(source->strategy), 
-	strategy_to_string(destination->strategy));
-    break;
-  }
-}
-
-void
-convert_to_ekmr(tensor_t *destination, tensor_t *source)
-{
-  debug("convert_to_ekmr(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (source->strategy) {
-  case strategy::coordinate:
-    convert_from_coordinate_to_ekmr(destination, source);
-    break;
-  default:
-    die("Conversion from '%s' strategy to '%s' is not currently supported.\n",
-	strategy_to_string(source->strategy), 
-	strategy_to_string(destination->strategy));
-    break;
-  }
-}
-
-void
-convert_to_zzekmr(tensor_t *destination, tensor_t *source)
-{
-  debug("convert_to_zzekmr(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (source->strategy) {
-  case strategy::coordinate:
-    convert_from_coordinate_to_zzekmr(destination, source);
-    break;
-  default:
-    die("Conversion from '%s' strategy to '%s' is not currently supported.\n",
-	strategy_to_string(source->strategy), 
-	strategy_to_string(destination->strategy));
-    break;
-  }
-}
-
-void
-tensor_storage_convert(tensor_t *destination, tensor_t *source)
-{
-  debug("tensor_storage_convert(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (destination->strategy) {
-  case strategy::coordinate:
-    convert_to_coordinate(destination, source);
-    break;
-  case strategy::compressed:
-  case strategy::slice:
-    convert_to_compressed(destination, source);
-    break;
-  case strategy::gundersen:
-    convert_to_gundersen(destination, source);
-    break;
-  case strategy::ekmr:
-    convert_to_ekmr(destination, source);
-    break;
-  case strategy::zzekmr:
-    convert_to_zzekmr(destination, source);
-    break;
-  default:
-    die("Conversion from '%s' strategy to '%s' is not currently supported.\n",
-	strategy_to_string(source->strategy), 
-	strategy_to_string(destination->strategy));
-    break;
-  }
-}
-
diff --git a/src/tensor_storage_coordinate.cc b/src/tensor_storage_coordinate.cc
deleted file mode 100644
index c82e955..0000000
--- a/src/tensor_storage_coordinate.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-
-#include "error.h"
-#include "memory.h"
-#include "mmio.h"
-#include "tensor.h"
-#include "storage.h"
-#include "utility.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-void
-tensor_storage_convert_from_compressed_tube_to_coordinate(tensor_t *destination, tensor_t *source)
-{
-  uint                        i, t, r0, r;
-  uint                        n, rn, nnz;
-  tensor_storage_coordinate_t *d;
-  tensor_storage_compressed_t *s;
-  coordinate_tuple_t          *T;
-  double                      *V;
-  uint                        *R, *C, *K;
-  
-  s = STORAGE_COMPRESSED(source);
-  d = STORAGE_COORIDINATE(destination);
-  
-  debug("tensor_storage_convert_from_compressed_tube_to_coordinate(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  nnz = source->nnz;
-  T   = d->tuples;
-  
-  n   = source->n;
-  rn  = s->rn;
-  R   = s->RO;
-  C   = s->CO;
-  K   = s->KO;
-  V   = source->values;
-  
-  for (r = 1, t = 0; r < rn; ++r) {
-    r0 = r-1;
-    for (i = R[r0]; i < R[r]; ++i, ++t) {
-      T[t].i     = r0;
-      T[t].j     = K[i];
-      T[t].k     = C[i];
-      T[t].index = i;
-      
-    }
-  }
-  
-  for (i = 0; i < nnz; ++i) {
-    destination->values[i] = source->values[i];
-  }
-}
-
-void
-tensor_storage_convert_from_compressed_slice_to_coordinate(tensor_t *destination, tensor_t *source)
-{
-  uint                        i, t, r0, r;
-  uint                        n, rn, nnz;
-  tensor_storage_coordinate_t *d;
-  tensor_storage_compressed_t *s;
-  coordinate_tuple_t          *T;
-  double                      *V;
-  uint                        *R, *C, *K;
-  
-  s = STORAGE_COMPRESSED(source);
-  d = STORAGE_COORIDINATE(destination);
-  
-  debug("tensor_storage_convert_from_compressed_slice_to_coordinate(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  nnz = source->nnz;
-  T   = d->tuples;
-  
-  n   = source->n;
-  rn  = s->rn;
-  R   = s->RO;
-  C   = s->CO;
-  K   = s->KO;
-  V   = source->values;
-  
-  for (r = 1, t = 0; r < rn; ++r) {
-    r0 = r-1;
-    DEBUG("R[r0=%u]=%u, R[r=%u]=%u\n", r0, R[r0], r, R[r]);
-    for (i = R[r0]; i < R[r]; ++i, ++t) {
-      DEBUG("K[i=%u]=%u\n", i, K[i]);
-      T[t].i     = K[i] / n;
-      T[t].j     = K[i] % n;
-      T[t].k     = r0 % n;
-      T[t].index = i;
-      DEBUG("i=%u, j=%u, k=%u, index=%u\n", T[t].i, T[t].j, T[t].k, T[t].index);
-    }
-  }
-  
-  for (i = 0; i < nnz; ++i) {
-    destination->values[i] = source->values[i];
-  }
-}
-
-void
-tensor_storage_convert_from_compressed_to_coordinate(tensor_t *destination, tensor_t *source)
-{
-  debug("tensor_storage_convert_from_compressed_to_coordinate(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  switch (source->strategy) {
-  case strategy::compressed:
-    tensor_storage_convert_from_compressed_tube_to_coordinate(destination, source);
-    break;
-  case strategy::slice:
-    tensor_storage_convert_from_compressed_slice_to_coordinate(destination, source);
-    break;
-  default:
-    die("tensor_storage_convert_from_compressed_to_coordinate: "
-	"unknown or unsupported strategy %d.\n", 
-	source->strategy);
-    break;
-  }
-}
-
-tensor_storage_coordinate_t*
-tensor_storage_malloc_coordinate(tensor_t const *tensor)
-{
-  tensor_storage_base_t       *base;
-  tensor_storage_coordinate_t *storage;
-  
-  superfluous("tensor_storage_malloc_coordinate(tensor=0x%x, nnz=%d)\n", tensor, tensor->nnz);
-  
-  storage         = MALLOC(tensor_storage_coordinate_t);
-  storage->tuples = MALLOC_N(coordinate_tuple_t, tensor->nnz);
-  base            = (tensor_storage_base_t*) storage;
-  base->callbacks = NULL;
-  
-  superfluous("tensor_storage_malloc_coordinate: storage->tuples=0x%x\n", storage->tuples);
-  superfluous("tensor_storage_malloc_coordinate: storage=0x%x\n", storage);
-  
-  return storage;
-}
diff --git a/src/tensor_storage_ekmr.cc b/src/tensor_storage_ekmr.cc
deleted file mode 100644
index 3b64028..0000000
--- a/src/tensor_storage_ekmr.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-
-#include "error.h"
-#include "memory.h"
-#include "mmio.h"
-#include "tensor.h"
-#include "storage.h"
-#include "utility.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-void
-index_copy_for_ekmr_row(void *destination, void const *source, uint nnz)
-{
-  uint                              i, n;
-  tensor_storage_coordinate_t const *s;
-  tensor_storage_extended_t         *d;
-  
-  s = (tensor_storage_coordinate_t const*) source;
-  d = (tensor_storage_extended_t*) destination;
-  n = d->rn - 1;
-  
-  debug("index_copy_for_ekmr_row(destination=0x%x, source=0x%x, nnz=%d)\n", d, s, nnz);
-  
-  for (i = 0; i < nnz; ++i) {
-    d->CK[i] = s->tuples[i].j * n + s->tuples[i].k;
-  }
-}
-
-void
-tensor_storage_convert_from_coordinate_to_ekmr(tensor_t *destination, tensor_t *source)
-{
-  int                         n, nnz;
-  tensor_storage_base_t       *base;
-  tensor_storage_extended_t   *d;
-  tensor_storage_coordinate_t *s;
-  coordinate_tuple_t          *tuples;
-  double                      *values;
-  
-  s = STORAGE_COORIDINATE(source);
-  d = STORAGE_EXTENDED(destination);
-  
-  debug("tensor_storage_convert_from_coordinate_to_ekmr(destination=0x%x, source=0x%x)\n", d, s);
-  
-  base   = STORAGE_BASE(destination);
-  nnz    = source->nnz;
-  n      = source->n;
-  values = source->values;
-  tuples = s->tuples;
-  
-  qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare);
-  d->rn = tensor_storage_index_encode(d->RO, n, tuples, nnz, base->callbacks->index_r_encoder);
-  tensor_storage_copy(d, s, nnz, base->callbacks->index_copy);
-  tensor_storage_copy(destination, source, nnz, (index_copy_t) &copier_for_values);
-}
-
-tensor_storage_extended_t*
-tensor_storage_malloc_ekmr(tensor_t const *tensor)
-{
-  tensor_storage_base_t     *base;
-  tensor_storage_extended_t *storage;
-  conversion_callbacks_t    *callbacks;
-  
-  superfluous("tensor_storage_malloc_ekmr(tensor=0x%x)\n", tensor);
-  
-  storage      = MALLOC(tensor_storage_extended_t);
-  storage->rn  = 0;
-  storage->ckn = tensor->nnz;
-  storage->RO  = NULL;
-  storage->CK  = MALLOC_N(uint, storage->ckn);
-  
-  callbacks                  = MALLOC(conversion_callbacks_t);
-  callbacks->index_compare   = NULL;
-  callbacks->index_r_encoder = NULL;
-  callbacks->index_copy	     = NULL;
-  
-  switch (tensor->orientation) {
-  case orientation::row:
-    storage->rn                = tensor->n;
-    callbacks->index_compare   = (index_compare_t) &index_compare_ijk;
-    callbacks->index_r_encoder = &encoder_for_i;
-    callbacks->index_copy      = &index_copy_for_ekmr_row;
-    break;
-  default:
-    die("Tensor orientation '%s' not yet supported.\n", orientation_to_string(tensor->orientation));
-    break;
-  }
-  
-  storage->rn    += 1;
-  storage->RO     = MALLOC_N(uint, storage->rn);
-  base            = (tensor_storage_base_t*) storage;
-  base->callbacks = callbacks;
-  
-  superfluous("tensor_storage_malloc_ekmr: callbacks=0x%x\n", callbacks);  
-  superfluous("tensor_storage_malloc_ekmr: storage->CK=0x%x\n", storage->CK);
-  superfluous("tensor_storage_malloc_ekmr: storage->size (of R)=%d\n", storage->rn);
-  superfluous("tensor_storage_malloc_ekmr: storage->RO=0x%x\n", storage->RO);
-  superfluous("tensor_storage_malloc_ekmr: storage=0x%x\n", storage);
-  
-  return storage;
-}
diff --git a/src/tensor_storage_gundersen.cc b/src/tensor_storage_gundersen.cc
deleted file mode 100644
index c33f97c..0000000
--- a/src/tensor_storage_gundersen.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-
-#include "error.h"
-#include "memory.h"
-#include "mmio.h"
-#include "tensor.h"
-#include "storage.h"
-#include "utility.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-void
-tensor_storage_convert_from_coordinate_to_gundersen(tensor_t *destination, tensor_t *source)
-{
-  uint                        nnz;
-  uint                        rn, cn, index, current, prev_ri, prev_ci;
-  tensor_storage_base_t       *base;
-  tensor_storage_compressed_t *d;
-  tensor_storage_coordinate_t *s;
-  coordinate_tuple_t          *tuples;
-  double                      *values;
-  uint                        *R, *C, *K;
-  index_encoder_t             r_encoder, c_encoder;
-  
-  debug("tensor_storage_convert_from_coordinate_to_gundersen(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  nnz       = source->nnz;
-  values    = source->values;
-  
-  base      = STORAGE_BASE(destination);
-  r_encoder = base->callbacks->index_r_encoder;
-  c_encoder = base->callbacks->index_c_encoder;
-  
-  s         = STORAGE_COORIDINATE(source);
-  d         = STORAGE_COMPRESSED(destination);
-  tuples    = s->tuples;
-  R         = d->RO;
-  C         = d->CO;
-  K         = d->KO;
-  
-  qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare);
-  tensor_storage_copy(d, s, nnz, base->callbacks->index_copy);
-  tensor_storage_copy(destination, source, nnz, (index_copy_t) &copier_for_values);
-  
-  rn      = 0;
-  cn      = 0;
-  prev_ri = r_encoder(&tuples[0]);
-  prev_ci = c_encoder(&tuples[0]);
-  
-  C[cn++] = 0;
-  R[rn++] = 1;
-  
-  for (current = 0; current < nnz; ++current) {
-    DEBUG("i=%u, j=%u, k=%u, index=%u\n", 
-	  tuples[current].i, tuples[current].j, 
-	  tuples[current].k, tuples[current].index);
-    index = base->callbacks->index_c_encoder(&tuples[current]);
-    if (prev_ci != index) {
-      DEBUG("C[size=%u]=%u\n", cn, current);
-      C[cn++] = current;
-      prev_ci = index;
-    }
-    index = base->callbacks->index_r_encoder(&tuples[current]);
-    if (prev_ri != index) {
-      DEBUG("R[size=%u]=%u\n", rn, current);
-      R[rn++] = cn;
-      prev_ri = index;
-    }
-  }
-  
-  DEBUG("C[size=%u]=%u\n", cn, nnz);
-  DEBUG("R[size=%u]=%u\n", rn, cn);
-  
-  C[cn++] = nnz;
-  R[rn++] = cn;
-  
-  DEBUG("rn=%u\n", rn);
-  DEBUG("cn=%u\n", cn);
-  
-  d->rn = rn;
-  d->cn = cn;
-}
-
-tensor_storage_compressed_t*
-tensor_storage_compressed_gundersen(tensor_t const *tensor)
-{
-  tensor_storage_base_t       *base;
-  tensor_storage_compressed_t *storage;
-  conversion_callbacks_t      *callbacks;
-  
-  superfluous("tensor_storage_compressed_gundersen(tensor=0x%x)\n", tensor);
-  
-  storage     = MALLOC(tensor_storage_compressed_t);
-  storage->rn = tensor->n + 1;
-  storage->cn = tensor->n * tensor->n + 1;
-  storage->tn = tensor->n * tensor->n + 1;
-  storage->kn = tensor->nnz;
-  storage->RO = MALLOC_N(uint, storage->rn);
-  storage->CO = MALLOC_N(uint, storage->cn);
-  storage->TO = MALLOC_N(uint, storage->tn);
-  storage->KO = MALLOC_N(uint, storage->kn);
-  
-  debug("tensor_storage_compressed_gundersen: rn=%d, cn=%d, tn=%d, kn=%d\n", 
-	storage->rn, storage->cn, storage->tn, storage->kn);
-  
-  callbacks                  = MALLOC(conversion_callbacks_t);
-  callbacks->index_compare   = NULL;
-  callbacks->index_r_encoder = NULL;
-  callbacks->index_c_encoder = NULL;
-  callbacks->index_copy	     = NULL;
-  
-  switch (tensor->orientation) {
-  case orientation::tube:
-    callbacks->index_compare   = (index_compare_t) &index_compare_jik;
-    callbacks->index_r_encoder = &encoder_for_j;
-    callbacks->index_c_encoder = &encoder_for_i;
-    callbacks->index_copy      = (index_copy_t) &copier_for_k;
-    break;
-  default:
-    die("tensor_storage_compressed_gundersen: "
-	"unknown or unsupported orientation %d.\n", 
-	tensor->orientation);
-    break;
-  }
-  
-  base            = (tensor_storage_base_t*) storage;
-  base->callbacks = callbacks;
-  
-  superfluous("tensor_storage_compressed_gundersen: callbacks=0x%x\n", callbacks);
-  superfluous("tensor_storage_malloc_compressed: storage->CO=0x%x\n", storage->CO);
-  superfluous("tensor_storage_malloc_compressed: storage->KO=0x%x\n", storage->KO);
-  superfluous("tensor_storage_compressed_gundersen: storage->size (of RO)=%d\n", storage->rn);
-  superfluous("tensor_storage_compressed_gundersen: storage->RO=0x%x\n", storage->RO);
-  superfluous("tensor_storage_compressed_gundersen: storage=0x%x\n", storage);
-  
-  return storage;
-}
diff --git a/src/tensor_storage_malloc.cc b/src/tensor_storage_malloc.cc
deleted file mode 100644
index 9abd3f7..0000000
--- a/src/tensor_storage_malloc.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-
-#include "error.h"
-#include "memory.h"
-#include "mmio.h"
-#include "storage.h"
-#include "tensor.h"
-#include "utility.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-void*
-tensor_storage_malloc(tensor_t const *tensor)
-{
-  void *storage;
-  
-  superfluous("tensor_storage_malloc(tensor=0x%x [strategy='%s'])\n", 
-	tensor, strategy_to_string(tensor->strategy));
-  
-  storage = NULL;
-  
-  switch (tensor->strategy) {
-  case strategy::coordinate:
-    storage = tensor_storage_malloc_coordinate(tensor);
-    break;
-  case strategy::compressed:
-    storage = tensor_storage_malloc_compressed(tensor);
-    break;
-  case strategy::slice:
-    storage = tensor_storage_malloc_compressed_slice(tensor);
-    break;
-  case strategy::ekmr:
-    storage = tensor_storage_malloc_ekmr(tensor);
-    break;
-  case strategy::zzekmr:
-    storage = tensor_storage_malloc_zzekmr(tensor);
-    break;
-  default:
-    die("Tensor storage strategy '%d' is not supported.\n", tensor->strategy);
-  }
-  
-  superfluous("tensor_storage_malloc: storage=0x%x\n", storage);
-  
-  return storage;
-}
diff --git a/src/tensor_storage_matrix_slice.cc b/src/tensor_storage_matrix_slice.cc
deleted file mode 100644
index dde6577..0000000
--- a/src/tensor_storage_matrix_slice.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-
-#include "error.h"
-#include "memory.h"
-#include "mmio.h"
-#include "tensor.h"
-#include "storage.h"
-#include "utility.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-static uint g_n;
-
-static void
-copier_for_slice_lateral(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->KO[i] = source->tuples[i].i * g_n + source->tuples[i].k;
-}
-
-static void
-copier_for_slice_horizontal(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->KO[i] = source->tuples[i].j * g_n + source->tuples[i].k;
-}
-
-static void
-copier_for_slice_frontal(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->KO[i] = source->tuples[i].i * g_n + source->tuples[i].j;
-}
-
-void
-tensor_storage_convert_from_coordinate_to_matrix_slice(tensor_t *destination, tensor_t *source)
-{
-  uint                        n, nnz;
-  tensor_storage_base_t       *base;
-  tensor_storage_compressed_t *d;
-  tensor_storage_coordinate_t *s;
-  coordinate_tuple_t          *tuples;
-  double                      *values;
-  
-  s = STORAGE_COORIDINATE(source);
-  d = STORAGE_COMPRESSED(destination);
-  
-  debug("tensor_storage_convert_from_coordinate_to_matrix_slice(destination=0x%x, source=0x%x)\n", destination, source);
-
-  base   = STORAGE_BASE(destination);
-  nnz    = source->nnz;
-  n      = source->n;
-  values = source->values;
-  tuples = s->tuples;
-  g_n    = source->n;
-  
-  qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare);
-  d->rn = tensor_storage_index_encode(d->RO, n, tuples, nnz, base->callbacks->index_r_encoder);
-  tensor_storage_copy(d, s, nnz, base->callbacks->index_copy);
-  tensor_storage_copy(destination, source, nnz, (index_copy_t) &copier_for_values);
-}
-
-tensor_storage_compressed_t*
-tensor_storage_malloc_matrix_slice(tensor_t const *tensor)
-{
-  tensor_storage_base_t       *base;
-  tensor_storage_compressed_t *storage;
-  conversion_callbacks_t      *callbacks;
-  
-  superfluous("tensor_storage_malloc_matrix_slice(tensor=0x%x)\n", tensor);
-  
-  storage     = MALLOC(tensor_storage_compressed_t);
-  storage->rn = tensor->n * tensor->n + 1;
-  storage->kn = tensor->nnz;
-  storage->RO = MALLOC_N(uint, storage->rn);
-  storage->CO = NULL;
-  storage->TO = NULL;
-  storage->KO = MALLOC_N(uint, storage->kn);
-  
-  debug("tensor_storage_malloc_matrix_slice: rn=%d, kn=%d\n", storage->rn, storage->kn);
-  
-  callbacks                  = MALLOC(conversion_callbacks_t);
-  callbacks->index_compare   = NULL;
-  callbacks->index_r_encoder = NULL;
-  callbacks->index_copy	     = NULL;
-  
-  switch (tensor->orientation) {
-  case orientation::lateral:
-    callbacks->index_compare   = (index_compare_t) &index_compare_jik;
-    callbacks->index_r_encoder = &encoder_for_j;
-    callbacks->index_copy      = (index_copy_t) &copier_for_slice_lateral;
-    break;
-  case orientation::horizontal:
-    callbacks->index_compare   = (index_compare_t) &index_compare_ijk;
-    callbacks->index_r_encoder = &encoder_for_i;
-    callbacks->index_copy      = (index_copy_t) &copier_for_slice_horizontal;
-    break;
-  case orientation::frontal:
-    callbacks->index_compare   = (index_compare_t) &index_compare_kij;
-    callbacks->index_r_encoder = &encoder_for_k;
-    callbacks->index_copy      = (index_copy_t) &copier_for_slice_frontal;
-    break;
-  default:
-    die("tensor_storage_malloc_matrix_slice: "
-	"unknown or unsupported orientation %d.\n", 
-	tensor->orientation);
-    break;
-  }
-  
-  base            = (tensor_storage_base_t*) storage;
-  base->callbacks = callbacks;
-  
-  superfluous("tensor_storage_malloc_matrix_slice: callbacks=0x%x\n", callbacks);
-  superfluous("tensor_storage_malloc_matrix_slice: storage->RO=0x%x\n", storage->RO);
-  superfluous("tensor_storage_malloc_matrix_slice: storage->CO=0x%x\n", storage->CO);
-  superfluous("tensor_storage_malloc_matrix_slice: storage->TO=0x%x\n", storage->TO);
-  superfluous("tensor_storage_malloc_matrix_slice: storage->KO=0x%x\n", storage->KO);
-  superfluous("tensor_storage_malloc_matrix_slice: storage=0x%x\n", storage);
-  
-  return storage;
-}
diff --git a/src/tensor_storage_utility.cc b/src/tensor_storage_utility.cc
deleted file mode 100644
index aadc198..0000000
--- a/src/tensor_storage_utility.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-
-#include "error.h"
-#include "storage.h"
-#include <string.h>
-
-int
-index_compare_ijk(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb)
-{
-  int result;
-  
-  if (0 == (result = ta->i - tb->i)) {
-    if (0 == (result = ta->j - tb->j)) {
-      result = ta->k - tb->k;
-    }
-  }
-  
-  return result;
-}
-
-int
-index_compare_jik(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb)
-{
-  int result;
-  
-  if (0 == (result = ta->j - tb->j)) {
-    if (0 == (result = ta->i - tb->i)) {
-      result = ta->k - tb->k;
-    }
-  }
-  
-  return result;
-}
-
-int
-index_compare_jki(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb)
-{
-  int result;
-  
-  if (0 == (result = ta->j - tb->j)) {
-    if (0 == (result = ta->k - tb->k)) {
-      result = ta->i - tb->i;
-    }
-  }
-  
-  return result;
-}
-
-int 
-index_compare_kji(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb)
-{
-  int result;
-  
-  if (0 == (result = ta->k - tb->k)) {
-    if (0 == (result = ta->j - tb->j)) {
-      result = ta->i - tb->i;
-    }
-  }
-  
-  return result;
-}
-
-int 
-index_compare_kij(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb)
-{
-  int result;
-  
-  if (0 == (result = ta->k - tb->k)) {
-    if (0 == (result = ta->i - tb->i)) {
-      result = ta->j - tb->j;
-    }
-  }
-  
-  return result;
-}
-
-int 
-index_compare_ikj(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb)
-{
-  int result;
-  
-  if (0 == (result = ta->i - tb->i)) {
-    if (0 == (result = ta->k - tb->k)) {
-      result = ta->j - tb->j;
-    }
-  }
-  
-  return result;
-}
-
-
-uint
-encoder_for_i(coordinate_tuple_t const *tuple)
-{
-  return tuple->i;
-}
-
-uint
-encoder_for_j(coordinate_tuple_t const *tuple)
-{
-  return tuple->j;
-}
-
-uint
-encoder_for_k(coordinate_tuple_t const *tuple)
-{
-  return tuple->k;
-}
-
-uint
-tensor_storage_index_encode(uint *indices, uint n, coordinate_tuple_t const *tuple, uint nnz, index_encoder_t encoder)
-{
-  uint i, t;
-  uint index;
-  
-  debug("tensor_storage_index_encode(indices=0x%x, tuple=0x%x, nnz=%d)\n", indices, tuple, nnz);
-  
-#if 0
-  for (current = 0; current < nnz; ++current) {
-    DEBUG("current=%u: i=%u, j=%u, k=%u, index=%u\n", 
-	  current, tuple[current].i, tuple[current].j,
-	  tuple[current].k, tuple[current].index);
-    index = encoder(&tuple[current]);
-    if (previous != index) {
-      DEBUG("indices[size=%u]=%u\n", size, current);
-      indices[size++] = current;
-      previous        = index;
-    }
-  }
-  
-  DEBUG("indices[size=%u]=%u\n", size, nnz);
-  indices[size++] = nnz;
-  DEBUG("size=%u\n", size);
-#endif
-  
-  indices[0] = 0;
-  index      = encoder(&tuple[0]);
-  
-  for (i = 1; i < index; ++i) {
-    indices[i] = 0;
-  }
-
-  for (t = 0; t < nnz; ++t) {
-    DEBUG("t=%u: i=%u, j=%u, k=%u, index=%u\n", t, tuple[t].i, tuple[t].j, tuple[t].k, tuple[t].index);
-    index = encoder(&tuple[t]);
-    if (i != index) {
-      DEBUG("indices[i=%u]=%u\n", i, t);
-      for (; i < index; ++i) {
-	indices[i] = t;
-      }
-    }
-  }
-  
-  for (; i < n; ++i) {
-    indices[i] = nnz;
-  }
-  
-  DEBUG("indices[i=%u]=%u\n", i, nnz);
-  indices[i++] = nnz;
-  DEBUG("i=%u\n", i);
-  
-  return i;
-}
-
-void
-copier_for_i(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->KO[i] = source->tuples[i].i;
-}
-
-void
-copier_for_j(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->KO[i] = source->tuples[i].j;
-}
-
-void
-copier_for_k(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i)
-{
-  destination->KO[i] = source->tuples[i].k;
-}
-
-void
-copier_for_values(tensor_t *destination, tensor_t const *source, uint i)
-{
-  destination->values[i] = source->values[STORAGE_COORIDINATE(source)->tuples[i].index];
-}
-
-void
-tensor_storage_copy(void *destination, void const *source, uint nnz, index_copy_t copier)
-{
-  uint i;
-  
-  debug("storage_index_copy(destination=0x%x, source=0x%x)\n", destination, source);
-  
-  for (i = 0; i < nnz; ++i) {
-    copier(destination, source, i);
-  }
-}
diff --git a/src/tensor_storage_zzekmr.cc b/src/tensor_storage_zzekmr.cc
deleted file mode 100644
index 68cc65e..0000000
--- a/src/tensor_storage_zzekmr.cc
+++ /dev/null
@@ -1,140 +0,0 @@
-
-#include "error.h"
-#include "memory.h"
-#include "mmio.h"
-#include "tensor.h"
-#include "storage.h"
-#include "utility.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-static uint g_r;
-
-int 
-tensor_storage_index_compare_for_zzekmr_row(void const *a, void const *b)
-{
-  uint                     ja, jb;
-  int                      result;
-  coordinate_tuple_t const *ta, *tb;
-  
-  ta = (coordinate_tuple_t const*) a;
-  tb = (coordinate_tuple_t const*) b;
-  ja = ta->j * g_r + ta->k;
-  jb = tb->j * g_r + tb->k;
-  
-  /* We are doing just about exacly what the EKMR encoding does,
-     except we reverse the order of every other row (we assume, for
-     now, that all rows are non-empty).  We do this for a very simple,
-     but elegant reason.  Take for instance the dense vector, sparse
-     tensor product: Say we are at the end of a row, in terms of doing
-     an operation.  The cache will have been primed with the elemets
-     of one of the extremes of the vector.  If we naively pull in the
-     next row, we must also pull in the matching vector elements.
-     However, if we bring in the next row *in reverse order*, we will
-     likely already have the matching vector elements in the
-     cache. Thus, in general, we may not nessearily invalidate the
-     existing cache lines. */
-  
-  if (0 == (result = ta->i - tb->i)) {
-    if (ta->i % 2) { /* odd */
-      result = jb - ja;
-    } else {         /* even */
-      result = ja - jb;
-    }
-  }
-  
-  return result;
-}
-
-void
-tensor_storage_index_copy_for_zzekmr_row(void *destination, void const *source, uint nnz)
-{
-  uint i;
-  tensor_storage_coordinate_t const *s;
-  tensor_storage_extended_t         *d;
-  
-  s = (tensor_storage_coordinate_t const*) source;
-  d = (tensor_storage_extended_t*) destination;
-  
-  debug("tensor_storage_index_copy_for_zzekmr_row(destination=0x%x, source=0x%x, nnz=%d)\n", d, s, nnz);
-  
-  for (i = 0; i < nnz; ++i) {
-    d->CK[i] = s->tuples[i].j * g_r + s->tuples[i].k;
-  }
-}
-
-void
-tensor_storage_convert_from_coordinate_to_zzekmr(tensor_t *destination, tensor_t *source)
-{
-  uint                  i, n, nnz;
-  tensor_storage_base_t       *base;
-  tensor_storage_extended_t   *d;
-  tensor_storage_coordinate_t *s;
-  coordinate_tuple_t   *tuples;
-  double               *values;
-   
-  s = STORAGE_COORIDINATE(source);
-  d = STORAGE_EXTENDED(destination);
-  
-  debug("tensor_storage_convert_from_coordinate_to_zzekmr(destination=0x%x, source=0x%x)\n", d, s);
-  
-  base   = STORAGE_BASE(destination);
-  nnz    = source->nnz;
-  n      = source->n;
-  values = source->values;
-  g_r    = d->rn;
-  tuples = s->tuples;
-  
-  qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare);
-  d->rn = tensor_storage_index_encode(d->RO, n, tuples, nnz, base->callbacks->index_r_encoder);
-  (*base->callbacks->index_copy)(d, s, nnz);
-  
-  for (i = 0; i < nnz; ++i) {
-    destination->values[i] = values[tuples[i].index];
-  }
-}
-
-tensor_storage_extended_t*
-tensor_storage_malloc_zzekmr(tensor_t const *tensor)
-{
-  tensor_storage_base_t     *base;
-  tensor_storage_extended_t *storage;
-  conversion_callbacks_t    *callbacks;
-  
-  superfluous("tensor_storage_malloc_zzekmr(tensor=0x%x)\n", tensor);
-  
-  storage     = MALLOC(tensor_storage_extended_t);
-  storage->CK = MALLOC_N(uint, tensor->nnz);
-  storage->RO = NULL;
-  storage->rn = 0;
-  
-  callbacks                  = MALLOC(conversion_callbacks_t);
-  callbacks->index_compare   = NULL;
-  callbacks->index_r_encoder = NULL;
-  callbacks->index_copy	     = NULL;
-  
-  switch (tensor->orientation) {
-  case orientation::row:
-    storage->rn                = tensor->n;
-    callbacks->index_compare   = &tensor_storage_index_compare_for_zzekmr_row;
-    callbacks->index_r_encoder = &encoder_for_i;
-    callbacks->index_copy      = &tensor_storage_index_copy_for_zzekmr_row;
-    break;
-  default:
-    die("Tensor orientation '%s' not yet supported.\n", orientation_to_string(tensor->orientation));
-    break;
-  }
-  
-  storage->rn    += 1;
-  storage->RO     = MALLOC_N(uint, storage->rn);
-  base            = (tensor_storage_base_t*) storage;
-  base->callbacks = callbacks;
-  
-  superfluous("tensor_storage_malloc_zzekmr: callbacks=0x%x\n", callbacks);  
-  superfluous("tensor_storage_malloc_zzekmr: storage->CK=0x%x\n", storage->CK);
-  superfluous("tensor_storage_malloc_zzekmr: storage->size (of R)=%d\n", storage->rn);
-  superfluous("tensor_storage_malloc_zzekmr: storage->RO=0x%x\n", storage->RO);
-  superfluous("tensor_storage_malloc_zzekmr: storage=0x%x\n", storage);
-  
-  return storage;
-}
diff --git a/src/tool_convert.cc b/src/tool_convert.cc
deleted file mode 100644
index 592fcd7..0000000
--- a/src/tool_convert.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-
-#include "cache.h"
-#include "compatible.h"
-#include "error.h"
-#include "file.h"
-#include "matrix.h"
-#include "operation.h"
-#include "tensor.h"
-#include "tool.h"
-#include "utility.h"
-#include "vector.h"
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <ctype.h>
-#include <unistd.h>
-
-extern cache_t           *cache;
-extern uint              cache_size;
-extern uint              cache_line_size;
-extern bool              emit_latex;
-extern uint              iterations;
-extern char              *tool_name;
-extern tool::type_t      tool_type;
-extern bool              simulate;
-extern bool              verbose;
-extern verbosity::type_t noisiness;
-extern bool              write_results;
-
-void
-convert_tool_usage() 
-{
-  print_tool_banner();
-  message("\nUsage:\n");
-  message("\t%s [options] <input> [output]\n", tool_name);
-  message("\nOptions:\n");
-  message("\t-h\tthis screen\n");
-  message("\t-l\temit LaTeX code as output (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_EMIT_LATEX));
-  message("\t-s\tstrategy (default: %s)\n", strategy_to_string(DEFAULT_STRATEGY));
-  print_strategies("\t\t- %s\n");
-  message("\t-o\torientation (default: %s)\n", orientation_to_string(DEFAULT_ORIENTATION));
-  print_orientations("\t\t- %s\n");
-  message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE));
-  message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max);
-  message("\nExample:\n\n");
-  message("\t$ ./tensor %s -s compressed -o column ieee-fig4.in tensor.out\n", tool_name);
-  message("\tReading ieee-fig4.in ... done [0.000305]\n");
-  message("\tConverting from 'coordinate' to 'compressed-column' ... done [0.000010]\n");
-  message("\tWriting tensor.out ... done [0.000031]\n");
-  exit(1);
-}
-
-tensor_t*
-timed_tensor_convert(tensor_t *source, strategy::type_t strategy, orientation::type_t orientation)
-{
-  precision_timer_t  t;
-  tensor_t *tensor;
-  
-  progress("Converting from '%s' to '%s-%s' ... ",
-	  strategy_to_string(source->strategy),
-	  strategy_to_string(strategy),
-	  orientation_to_string(orientation));
-  timer_start(&t);
-  tensor = tensor_convert(source, strategy, orientation);
-  timer_end(&t);
-  print_elapsed_time(t);
-  
-  return tensor;
-}
-
-void
-convert_tool_main(int argc, char *argv[])
-{
-  int                 c, offset;
-  char                *name;
-  tensor_t            *tensor, *result;
-  strategy::type_t    strategy;
-  orientation::type_t orientation;
-  
-  /* just to be safe, set the tensors to null */
-  tensor = result = NULL;
-  
-  /* set the program's defaults */
-  orientation = DEFAULT_ORIENTATION;
-  strategy    = DEFAULT_STRATEGY;
-  
-  /* we will privide our own error messages */
-  opterr = 0;
-  
-  /* extract any command-line options the user provided */
-  while (-1 != (c = getopt(argc, argv, ":hlo:s:vV:"))) {
-    switch (c) {
-    case 'h': 
-      convert_tool_usage();
-      break;
-    case 'l':
-      emit_latex = !emit_latex;
-      break;
-    case 'o':
-      if (isdigit(optarg[0])) {
-	orientation = (orientation::type_t) atoi(optarg);
-      } else {
-	orientation = string_to_orientation(optarg); 
-      }
-      break;
-    case 's':
-      if (isdigit(optarg[0])) {
-	strategy = (strategy::type_t) atoi(optarg);
-      } else {
-	strategy = string_to_strategy(optarg);
-      }
-      break;
-    case 'v': 
-      verbose = !verbose;
-      break;
-    case 'V':
-      noisiness = (verbosity::type_t) atoi(optarg);
-      if (0 == noisiness) {
-	noisiness = DEFAULT_VERBOSITY;
-      }
-      break;
-    case ':':
-      die("Option -%c requires an operand; that is, an integer or string value.\n", optopt);
-      break;
-    case '?':
-      die("Unknown option: `-%c'\n", optopt);
-      break;
-    default:
-      abort();
-      break;
-    }
-  }
-  
-  if (noisiness > DEFAULT_VERBOSITY) {
-    verbose = true;
-  }
-  
-  /* count the number of remaining arguments */
-  if (argc-optind < 1) {
-    convert_tool_usage();
-  }
-  
-  /* print program options, for debugging purposes */
-  print_tool_options();
-  debug("convert_tool_main: orientation='%s'\n", orientation_to_string(orientation));
-  debug("convert_tool_main: strategy='%s'\n", strategy_to_string(strategy));
-  
-  /* parse the remaining command line options */
-  offset = optind;
-  name   = argv[offset++];
-  tensor = timed_tensor_read(name);
-  debug("main: tensor=0x%x\n", tensor);
-  
-  if (strategy == tensor->strategy) {
-    /* we'll deal with differing orientation when it comes up */
-    result = tensor;
-    tensor = NULL;
-  } else {
-    result = timed_tensor_convert(tensor, strategy, orientation);
-  }
-  debug("main: result=0x%x\n", result);
-  
-  timed_tensor_write(argc, argv, offset, result);
-  
-  tensor_free(result);
-  tensor_free(tensor);
-}
diff --git a/src/tool_permute.cc b/src/tool_permute.cc
deleted file mode 100644
index b3d348b..0000000
--- a/src/tool_permute.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-
-#include "cache.h"
-#include "compatible.h"
-#include "error.h"
-#include "file.h"
-#include "matrix.h"
-#include "memory.h"
-#include "operation.h"
-#include "tensor.h"
-#include "tool.h"
-#include "utility.h"
-#include "vector.h"
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "timer.h"
-#include <ctype.h>
-#include <unistd.h>
-
-extern bool              human_readable;
-extern char              *tool_name;
-extern tool::type_t      tool_type;
-extern bool              tracing;
-extern bool              verbose;
-extern verbosity::type_t noisiness;
-extern bool              write_results;
-
-static permutation_heuristic::type_t heuristic;
-
-void
-permute_tool_usage() 
-{
-  print_tool_banner();
-  message("\nUsage:\n");
-  message("\t%s [options] <input1> <intput2> ... [output]\n", tool_name);
-  message("\nOptions:\n");
-  message("\t-h\tthis screen\n");
-  message("\t-p\tpermutation heuristic (default: %s)\n", permutation_heuristic_to_string(DEFAULT_PERMUTATION_HEURISTIC));
-  print_permutation_heuristics_with_descriptions("\t\t- %s : %s\n");
-  message("\t-t\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING));
-  message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE));
-  message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max);
-  message("\t-w\twrite results (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_WRITE_RESULTS));
-  message("\nExample:\n\n");
-  message("\t$ ./tensor %s -p naive-minimum tensor.in tensor.out\n", tool_name);
-  message("\tReading vector.in ... done [0.000305]\n");
-  message("\tReading tensor.in ... done [0.000235]\n");
-  message("\tConverting from 'coordinate' to 'compressed-column' ... done [0.000010]\n");
-  message("\tWriting matrix.out ... done [0.000031]\n");
-  exit(1);
-}
-
-tensor_t*
-timed_tensor_permute(tensor_t *tensor)
-{
-  precision_timer_t  t;
-  tensor_t *permuted;
-  
-  progress("Permuting tensor using the '%s' heuristic ... ", 
-	   permutation_heuristic_to_string(heuristic));
-  timer_start(&t);
-  permuted = tensor_permute(tensor, heuristic);
-  timer_end(&t);
-  print_elapsed_time(t);
-  
-  return permuted;
-}
-
-void
-timed_permutation(int argc, char *argv[])
-{
-  int      offset;
-  char     *name;
-  tensor_t *tensor, *permuted;
-  
-  offset = optind;
-  name   = argv[offset++];
-  tensor = timed_tensor_read(name);
-  debug("timed_permutation: tensor=0x%x\n", tensor);
-  
-  if (permutation_heuristic::none == heuristic) {
-    print_elapsed_time(0.0); /* just a no-op */
-  } else {
-    permuted = timed_tensor_permute(tensor);
-    tensor_free(tensor);
-    tensor = permuted;
-  }
-  debug("timed_permutation: permutation=0x%x\n", tensor);
-  
-  /* if we are not printing times for each procedure out in a human
-     consumable way, then we need to terminate the line containing all
-     the timings for this instance */
-  if (!human_readable) {
-    message("\n");
-  }
-  
-  timed_tensor_write(argc, argv, offset, tensor);
-  tensor_free(tensor);
-}
-
-void
-permute_tool_main(int argc, char *argv[])
-{
-  int c;
-  
-  /* set the program's defaults */
-  heuristic = DEFAULT_PERMUTATION_HEURISTIC;
-  
-  /* we will privide our own error messages */
-  opterr = 0;
-  
-  /* extract any command-line options the user provided */
-  while (-1 != (c = getopt(argc, argv, ":hp:tuvV:w"))) {
-    switch (c) {
-    case 'h': 
-      permute_tool_usage();
-      break;
-    case 'p': 
-      if (isdigit(optarg[0])) {
-	heuristic = (permutation_heuristic::type_t) atoi(optarg);
-      } else {
-	heuristic = string_to_permutation_heuristic(optarg);
-      }
-      break;
-    case 't':
-      tracing = !tracing;
-      break;
-    case 'u':
-      human_readable = !human_readable;
-      break;
-    case 'v': 
-      verbose = !verbose;
-      break;
-    case 'V':
-      noisiness = (verbosity::type_t) atoi(optarg);
-      if (0 == noisiness) {
-	noisiness = DEFAULT_VERBOSITY;
-      }
-      break;
-    case 'w':
-      write_results = !write_results;
-      break;
-    case ':':
-      die("Option -%c requires an operand; that is, an integer or string value.\n", optopt);
-      break;
-    case '?':
-      die("Unknown option: `-%c'\n", optopt);
-      break;
-    default:
-      abort();
-      break;
-    }    
-  }
-  
-  if (noisiness > DEFAULT_VERBOSITY) {
-    verbose = true;
-  }
-  
-  /* count the number of remaining arguments */
-  if (argc-optind < 1) {
-    permute_tool_usage();
-  }
-  
-  /* print program options, for debugging purposes */
-  print_tool_options();
-  debug("permute_tool_main: heuristic='%s'\n", permutation_heuristic_to_string(heuristic));
-  
-  /* pass control over to some naive timing procedures */
-  timed_permutation(argc, argv);
-}

From a372446b92806ce62f68c963464848204421e98e Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Wed, 9 Nov 2011 13:56:32 -0700
Subject: [PATCH 28/57] + Removed uneeded tools

---
 src/tool_utility.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/tool_utility.cc b/src/tool_utility.cc
index 7994529..895b451 100644
--- a/src/tool_utility.cc
+++ b/src/tool_utility.cc
@@ -18,10 +18,8 @@ extern verbosity::type_t noisiness;
 extern bool              write_results;
 
 #define TENSOR_DESCRIPTION     "A tool for working with tensors."
-#define CONVERT_DESCRIPTION    "A tool for converting between th-order tensor storage strategies."
 #define GENERATE_DESCRIPTION   "A tool for generating varieties of th-order tensors."
 #define EFFECTUATE_DESCRIPTION "A tool for performing computations on th-order tensors."
-#define PERMUTE_DESCRIPTION    "A tool for permuting the non-zeros of th-order tensors."
 #define VERSION "Version 0.01 (" __DATE__ "), " \
   "Copyright (C) 2011, and GPLv3'd, by Ben Burnett\n" \
   "This is free software; see the source for copying conditions.  There is NO\n" \
@@ -32,19 +30,15 @@ extern bool              write_results;
 static char const *map_tool_to_string[] = { 
   "unknown",
   "tensor",
-  "convert", 
   "generate",
   "effectuate",
-  "permute"
 };
 
 static char const *map_tools_to_description[] = { 
   "unknown",
   TENSOR_DESCRIPTION,
-  CONVERT_DESCRIPTION,
   GENERATE_DESCRIPTION,
   EFFECTUATE_DESCRIPTION,
-  PERMUTE_DESCRIPTION
 };
 
 char const* string_from_tool(tool::type_t tool)

From a6392d3fd45cc3d3edd79ff777d56c72a039b1b2 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Wed, 9 Nov 2011 13:57:53 -0700
Subject: [PATCH 29/57] + Added command line support to specify work
 partitioning scheme for threads

---
 src/main.cc                     | 34 +++++++------
 src/operation_n_mode_product.cc | 88 +++++++++++++++++++++++++++++----
 src/thread.cc                   | 28 ++++++++++-
 src/thread.h                    | 22 +++++++++
 src/tool.h                      |  2 +-
 src/tool_effectuate.cc          | 46 ++++++++++-------
 6 files changed, 174 insertions(+), 46 deletions(-)

diff --git a/src/main.cc b/src/main.cc
index b84b7bb..922276c 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -7,6 +7,7 @@
 #include "operation.h"
 #include "strings.h"
 #include "tensor.h"
+#include "thread.h"
 #include "tool.h"
 #include "utility.h"
 #include "vector.h"
@@ -17,22 +18,23 @@
 #include <ctype.h>
 #include <unistd.h>
 
-cache_t           *cache;
-uint              cache_size;
-uint              cache_line_size;
-uint              iterations;
-uint              memory_stride;
-uint              seed;
-uint              thread_count;
-char              *tool_name;
-tool::type_t      tool_type;
-bool              tracing;
-bool              simulate;
-bool              human_readable;
-bool              verbose;
-verbosity::type_t noisiness;
-bool              write_results;
-bool              emit_latex;
+cache_t                   *cache;
+uint                      cache_size;
+uint                      cache_line_size;
+uint                      iterations;
+uint                      memory_stride;
+thread::partition::type_t thread_partition;
+uint                      seed;
+uint                      thread_count;
+char                      *tool_name;
+tool::type_t              tool_type;
+bool                      tracing;
+bool                      simulate;
+bool                      human_readable;
+bool                      verbose;
+verbosity::type_t         noisiness;
+bool                      write_results;
+bool                      emit_latex;
 
 void
 usage()
diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index c5b1e57..3760bdd 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -11,11 +11,12 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-extern cache_t *cache;
-extern uint    memory_stride;
-extern uint    thread_count;
+extern cache_t			 *cache;
+extern uint			 memory_stride;
+extern uint			 thread_count;
+extern thread::partition::type_t thread_partition;
 
-static pthread_mutex_t tube_lock;
+static pthread_mutex_t           tube_lock;
 
 /*
   Computing ($pT$):
@@ -39,7 +40,7 @@ typedef struct {
 } product_thread_data_t;
 
 int
-traditional_next_tube(product_thread_data_t *data)
+tube_next(product_thread_data_t *data)
 {
   uint k;
   
@@ -50,7 +51,7 @@ traditional_next_tube(product_thread_data_t *data)
 }
 
 thread_address_t
-traditional_fiber_product(thread_argument_t *argument)
+tube_product(thread_argument_t *argument)
 {
   int                   t;
   uint                  i, j, k, offset;
@@ -66,7 +67,7 @@ traditional_fiber_product(thread_argument_t *argument)
   P = data->vector->data;
   T = data->tensor->values;
   
-  while (-1 != (t = traditional_next_tube(data))) {
+  while (-1 != (t = tube_next(data))) {
     sum    = 0;
     offset = t*n;
     i      = t/n;
@@ -80,8 +81,54 @@ traditional_fiber_product(thread_argument_t *argument)
   return NULL;
 }
 
+int
+slice_next(product_thread_data_t *data)
+{
+  uint k;
+  
+  thread_mutex_lock(&tube_lock);
+  k = data->done++;
+  thread_mutex_unlock(&tube_lock);
+  return k < (data->tensor->n) ? k : -1;
+}
+
+thread_address_t
+slice_product(thread_argument_t *argument)
+{
+  int                   i;
+  uint                  j, k;
+  uint                  ioffset, joffset;
+  uint                  n, sum[1000];
+  uint                  *P;
+  double                **M, *T;
+  product_thread_data_t *data;
+  
+  data = (product_thread_data_t*) thread_data(argument);
+  
+  n = data->tensor->n;
+  M = data->matrix->data;
+  P = data->vector->data;
+  T = data->tensor->values;
+  
+  while (-1 != (i = slice_next(data))) {
+    ioffset = i*n*n;
+    for (j = 0; j < n; ++j) {
+      sum[j]  = 0;
+      joffset = ioffset+j*n;
+      for (k = 0; k < n; ++k) {
+	sum[j] += P[k] * T[joffset+k];
+      }
+    }
+    for (j = 0; j < n; ++j) {
+      M[i][j] = sum[j];
+    }
+  }
+  
+  return NULL;
+}
+
 void
-threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor, thread_function_t function)
 {
   product_thread_data_t data;
   
@@ -95,11 +142,32 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   data.matrix = matrix;
   data.vector = vector;
   data.tensor = tensor;
-  
+
   thread_mutex_init(&tube_lock);
-  thread_fork(thread_count, traditional_fiber_product, &data, NULL);
+  thread_fork(thread_count, slice_product, &data, NULL);
   thread_mutex_destroy(&tube_lock);
 }
+
+void
+threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+{
+  thread_function_t function;
+  
+  switch (thread_partition) {
+  case thread::partition::tube:
+    function = (thread_function_t) &tube_product;
+    break;
+  case thread::partition::slice:
+    function = (thread_function_t) &slice_product;
+    break;
+  default:
+    die("serial_n_mode_product: tensor product for '%s' strategy is not currently supported.\n",
+	strategy_to_string(tensor->strategy));
+    break;
+  }
+  
+  threaded_n_mode_product_array(matrix, vector, tensor, function);
+}
  
 void
 serial_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
diff --git a/src/thread.cc b/src/thread.cc
index eec77dd..18f8ca9 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -5,11 +5,37 @@
  */
 
 #include "thread.h"
-
+#include "utility.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <errno.h>	/* for EBUSY */
 
+static char const *map_thread_partition_to_string[] = { 
+  "unknown",
+  "tube",
+  "slice"
+};
+
+char const*
+thread_partition_to_string(thread::partition::type_t partition)
+{
+  return map_thread_partition_to_string[partition];
+}
+
+thread::partition::type_t
+string_to_thread_partition(char const *name)
+{
+  uint i;
+  
+  for (i = 0; i < COUNT_OF(map_thread_partition_to_string); ++i) {
+    if (0 == strcmp(name, map_thread_partition_to_string[i])) {
+      return (thread::partition::type_t) i;
+    }
+  }
+  
+  return thread::partition::unknown;
+}
+
 /*************************************************
  * attempt to lock a mutex
  */
diff --git a/src/thread.h b/src/thread.h
index 8099434..3d418a6 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -8,6 +8,28 @@
 #ifndef _THREAD_H_
 #define _THREAD_H_
 
+namespace thread {
+  
+  namespace model {
+    typedef enum {
+      unknown,
+      traditional
+    } type_t;
+  }
+  
+  namespace partition {
+    typedef enum {
+      unknown,
+      tube,
+      slice
+    } type_t;
+  }
+  
+}
+
+char const* thread_partition_to_string(thread::partition::type_t partition);
+thread::partition::type_t string_to_thread_partition(char const *name);
+
 /* Linux defs:
  *   _REENTRANT to get thread-safe libs
  *   _POSIX_SOURCE to get POSIX semantics
diff --git a/src/tool.h b/src/tool.h
index a40c1cf..ba641a8 100644
--- a/src/tool.h
+++ b/src/tool.h
@@ -25,11 +25,11 @@ namespace tool {
 #define DEFAULT_MEMORY_STRIDE         32
 #define DEFAULT_OPERATION             operation::n_mode_product
 #define DEFAULT_ORIENTATION           orientation::row
-#define DEFAULT_PERMUTATION_HEURISTIC permutation_heuristic::none
 #define DEFAULT_SIMULATE              false
 #define DEFAULT_STRATEGY              strategy::compressed
 #define DEFAULT_TRACING               false
 #define DEFAULT_THREAD_COUNT          1
+#define DEFAULT_THREAD_PARTITION      thread::partition::tube
 #define DEFAULT_VERBOSE               false
 #define DEFAULT_VERBOSITY             verbosity::low
 #define DEFAULT_WRITE_RESULTS         false
diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index 314d116..9119197 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -7,6 +7,7 @@
 #include "memory.h"
 #include "operation.h"
 #include "tensor.h"
+#include "thread.h"
 #include "tool.h"
 #include "utility.h"
 #include "vector.h"
@@ -18,20 +19,21 @@
 #include <ctype.h>
 #include <unistd.h>
 
-extern cache_t           *cache;
-extern uint              cache_size;
-extern uint              cache_line_size;
-extern bool              human_readable;
-extern uint              iterations;
-extern uint              memory_stride;
-extern uint              thread_count;
-extern char              *tool_name;
-extern tool::type_t      tool_type;
-extern bool              simulate;
-extern bool              tracing;
-extern bool              verbose;
-extern verbosity::type_t noisiness;
-extern bool              write_results;
+extern cache_t			 *cache;
+extern uint			 cache_size;
+extern uint			 cache_line_size;
+extern bool			 human_readable;
+extern uint			 iterations;
+extern uint			 memory_stride;
+extern uint			 thread_count;
+extern thread::partition::type_t thread_partition;
+extern char			 *tool_name;
+extern tool::type_t		 tool_type;
+extern bool			 simulate;
+extern bool			 tracing;
+extern bool			 verbose;
+extern verbosity::type_t	 noisiness;
+extern bool			 write_results;
 
 static operation::type_t optcode;
 
@@ -180,10 +182,10 @@ effectuate_tool_main(int argc, char *argv[])
   int c;
   
   /* set the program's defaults */
-  memory_stride = DEFAULT_MEMORY_STRIDE;
-  optcode       = DEFAULT_OPERATION;
-  thread_count  = DEFAULT_THREAD_COUNT;
-  
+  memory_stride    = DEFAULT_MEMORY_STRIDE;
+  optcode          = DEFAULT_OPERATION;
+  thread_count     = DEFAULT_THREAD_COUNT;
+  thread_partition = DEFAULT_THREAD_PARTITION;
   
   /* we will privide our own error messages */
   opterr = 0;
@@ -219,6 +221,13 @@ effectuate_tool_main(int argc, char *argv[])
 	optcode = string_to_operation(optarg);
       }
       break;
+    case 'p':
+      if (isdigit(optarg[0])) {
+	thread_partition = (thread::partition::type_t) atoi(optarg);
+      } else {
+	thread_partition = string_to_thread_partition(optarg);
+      }
+      break;
     case 'r':
       memory_stride = atoi(optarg);
       if (0 == memory_stride) {
@@ -278,6 +287,7 @@ effectuate_tool_main(int argc, char *argv[])
   debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode));
   debug("effectuate_tool_main: memory_stride=%d\n", memory_stride);
   debug("effectuate_tool_main: thread_count=%d\n", thread_count);
+  debug("effectuate_tool_main: thread_partition='%s'\n", thread_partition_to_string(thread_partition));
   
   /* if we are just running a simulation, then we only do one
      iteration; otherwise, it would be really slow */

From 83344743ba85a312461cef89bfb17e8a29ea84a0 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Thu, 10 Nov 2011 20:47:26 -0700
Subject: [PATCH 30/57] + Using atomic increment instead of a mutex locked int
 + Configured the thread_fork call to allow for cpu affinity

---
 src/operation_n_mode_product.cc | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index 3760bdd..0bbedc5 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -16,8 +16,6 @@ extern uint			 memory_stride;
 extern uint			 thread_count;
 extern thread::partition::type_t thread_partition;
 
-static pthread_mutex_t           tube_lock;
-
 /*
   Computing ($pT$):
   Let $\T \in R^{n\times n\times n}$ be a tensor.
@@ -42,11 +40,11 @@ typedef struct {
 int
 tube_next(product_thread_data_t *data)
 {
-  uint k;
+  volatile uint k;
   
-  thread_mutex_lock(&tube_lock);
-  k = data->done++;
-  thread_mutex_unlock(&tube_lock);
+  /* rather than a lock we can take advantage of the architecture and
+     issue an atomic fetch and increment */
+  k = __sync_fetch_and_add(&data->done, 1);
   return k < (data->tensor->n*data->tensor->n) ? k : -1;
 }
 
@@ -55,9 +53,10 @@ tube_product(thread_argument_t *argument)
 {
   int                   t;
   uint                  i, j, k, offset;
-  uint                  n, sum;
+  uint                  n;
   uint                  *P;
   double                **M, *T;
+  double                sum;
   product_thread_data_t *data;
   
   data = (product_thread_data_t*) thread_data(argument);
@@ -84,12 +83,12 @@ tube_product(thread_argument_t *argument)
 int
 slice_next(product_thread_data_t *data)
 {
-  uint k;
+  volatile uint k;
   
-  thread_mutex_lock(&tube_lock);
-  k = data->done++;
-  thread_mutex_unlock(&tube_lock);
-  return k < (data->tensor->n) ? k : -1;
+  /* rather than a lock we can take advantage of the architecture and
+     issue an atomic fetch and increment */
+  k = __sync_fetch_and_add(&data->done, 1);
+  return k < data->tensor->n ? k : -1;
 }
 
 thread_address_t
@@ -98,9 +97,10 @@ slice_product(thread_argument_t *argument)
   int                   i;
   uint                  j, k;
   uint                  ioffset, joffset;
-  uint                  n, sum[1000];
+  uint                  n;
   uint                  *P;
   double                **M, *T;
+  double                sum[1000];
   product_thread_data_t *data;
   
   data = (product_thread_data_t*) thread_data(argument);
@@ -143,9 +143,7 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   data.vector = vector;
   data.tensor = tensor;
 
-  thread_mutex_init(&tube_lock);
-  thread_fork(thread_count, slice_product, &data, NULL);
-  thread_mutex_destroy(&tube_lock);
+  thread_afork(thread_count, slice_product, &data, NULL);
 }
 
 void

From 0fe863469e0db9cba608cf7e0481e16dd227fd51 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Thu, 10 Nov 2011 20:47:57 -0700
Subject: [PATCH 31/57] + Added cpu affinity configuration for the thread_fork
 procedure

---
 src/thread.cc | 26 ++++++++++++++++++++++----
 src/thread.h  | 25 +++++++++++++++++++++----
 2 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/src/thread.cc b/src/thread.cc
index 18f8ca9..c25ef50 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -10,6 +10,8 @@
 #include <stdlib.h>
 #include <errno.h>	/* for EBUSY */
 
+extern uint thread_count;
+
 static char const *map_thread_partition_to_string[] = { 
   "unknown",
   "tube",
@@ -72,12 +74,15 @@ thread_wait(pthread_t *thread, thread_address_t exitcode)
  * run nthreads threads in the routine start
  */
 void _thread_fork(int nthreads,
-	      thread_function_t start,
-	      thread_address_t arg,
-	      thread_address_t *exitcodes)
+		  thread_function_t start,
+		  thread_address_t arg,
+		  thread_address_t *exitcodes,
+		  int setaffinity)
 {
   int i;
   thread_argument_t *args;
+  pthread_attr_t attr, *pattr;
+  cpu_set_t mask;
   thread_address_t *address;
   
   if (nthreads<1) {
@@ -89,9 +94,22 @@ void _thread_fork(int nthreads,
   for (i=0; i<nthreads; i++) {
     args[i].nthreads=nthreads; args[i].myid=i; args[i].data=arg;
   }
+  pthread_attr_init(&attr);
   for (i=0; i<nthreads; i++) {
-    thread_create(&args[i].self,start,args+i);
+    pattr = NULL;
+#if 0
+    /* for this to work correctly, we need to detect the number of
+       CPUs */
+    if (setaffinity) {
+      CPU_ZERO(&mask);
+      CPU_SET(i%thread_count,&mask);
+      pthread_attr_setaffinity_np(&attr,sizeof(mask),&mask);
+      pattr = &attr;
+    }
+#endif
+    thread_create_with_attr(&args[i].self,pattr,start,args+i);
   }
+  pthread_attr_destroy(&attr);
   for (i=0; i<nthreads; i++) {
     address = (exitcodes==NULL?NULL:exitcodes+i);
     thread_wait(&args[i].self,address);
diff --git a/src/thread.h b/src/thread.h
index 3d418a6..685cd9d 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -70,6 +70,18 @@ typedef void *thread_address_t;
     }							    \
 }
 
+#define thread_create_with_attr(t,attr,start,arg)	\
+{ \
+    int errcode; \
+    \
+    if ((errcode=pthread_create(t, \
+				(attr),			    \
+				(thread_function_t) (start), \
+				(thread_address_t) (arg)))) {	\
+      THREAD_DIE("thread_create_with_attr", errcode);	     \
+    }							    \
+}
+
 #define thread_create_detached(start,arg) \
 { \
     pthread_t t; \
@@ -232,13 +244,18 @@ typedef struct _thread_argument_t_ {
 }
 
 extern void _thread_fork(int nthreads,
-		     thread_function_t start,
-		     thread_address_t arg,
-		     thread_address_t *exitcodes);
+			 thread_function_t start,
+			 thread_address_t arg,
+			 thread_address_t *exitcodes,
+			 int setaffinity);
 
 #define thread_fork(nt,start,arg,codes) \
   _thread_fork(nt,(thread_function_t) start, \
-	   (thread_address_t) arg,(thread_address_t *) codes)
+	       (thread_address_t) arg,(thread_address_t *) codes, 0)
+
+#define thread_afork(nt,start,arg,codes) \
+  _thread_fork(nt,(thread_function_t) start, \
+		(thread_address_t) arg,(thread_address_t *) codes, 1)
 
 /*************************************************
  * the gate struct (rendezvous point)

From 26f36ccc7ca425b313b18a589a14abae542de07c Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Fri, 11 Nov 2011 10:16:10 -0700
Subject: [PATCH 32/57] + Darwin does not support the cpu_set_t structure;
 there is probably an equivalent one that just needs to be found

---
 src/thread.cc | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/thread.cc b/src/thread.cc
index c25ef50..e968b14 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -10,8 +10,6 @@
 #include <stdlib.h>
 #include <errno.h>	/* for EBUSY */
 
-extern uint thread_count;
-
 static char const *map_thread_partition_to_string[] = { 
   "unknown",
   "tube",
@@ -81,9 +79,11 @@ void _thread_fork(int nthreads,
 {
   int i;
   thread_argument_t *args;
+  thread_address_t *address;
   pthread_attr_t attr, *pattr;
+#ifdef __linux__
   cpu_set_t mask;
-  thread_address_t *address;
+#endif
   
   if (nthreads<1) {
     die("thread_mutex_trylock: nthreads<1\n");
@@ -100,14 +100,16 @@ void _thread_fork(int nthreads,
 #if 0
     /* for this to work correctly, we need to detect the number of
        CPUs */
+#ifdef __linux__
     if (setaffinity) {
       CPU_ZERO(&mask);
       CPU_SET(i%thread_count,&mask);
       pthread_attr_setaffinity_np(&attr,sizeof(mask),&mask);
       pattr = &attr;
-    }
 #endif
+    }
     thread_create_with_attr(&args[i].self,pattr,start,args+i);
+#endif
   }
   pthread_attr_destroy(&attr);
   for (i=0; i<nthreads; i++) {

From 89b05f764e63441228bb70c72f96fc470790393c Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Fri, 11 Nov 2011 10:17:43 -0700
Subject: [PATCH 33/57] + Changed the thread library linker flag to be
 cross-platform

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index d59896a..15ecac1 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -16,7 +16,7 @@ ifndef SIMULATE
 	EXTRA_DEBUG += -DNOSIMULATE
 endif
 EXTRA_CXXFLAGS=-c $(EXTRA_DEBUG) $(STRICT) $(INCLUDES) $(CPPX11)
-EXTRA_LDFLAGS=-Wall -lpthread $(EXTRA_DEBUG)
+EXTRA_LDFLAGS=-Wall -thread $(EXTRA_DEBUG)
 
 HEADERS_CACHE=address.h cache.h hash.h
 HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h	\

From 2205a00847244e1efba5bfecde080d9f7a7b3bbe Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Fri, 11 Nov 2011 16:21:39 -0700
Subject: [PATCH 34/57] + Returned the vector object back to using doubles vs
 uints

---
 src/vector.h         | 2 +-
 src/vector_malloc.cc | 2 +-
 src/vector_read.cc   | 2 +-
 src/vector_write.cc  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/vector.h b/src/vector.h
index 22b71ac..709aff5 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -8,7 +8,7 @@
 typedef struct {
   uint              n;
   ownership::type_t owner;
-  uint              *data;
+  double            *data;
 } vector_t;
 
 vector_t* vector_malloc(uint n, ownership::type_t owner = ownership::creator);
diff --git a/src/vector_malloc.cc b/src/vector_malloc.cc
index 7b8daef..afa7d4b 100644
--- a/src/vector_malloc.cc
+++ b/src/vector_malloc.cc
@@ -19,7 +19,7 @@ vector_malloc(uint n, ownership::type_t owner)
     return v;
   }
   
-  v->data  = MALLOC_N(uint, n);
+  v->data  = MALLOC_N(double, n);
   
   return v;
 }
diff --git a/src/vector_read.cc b/src/vector_read.cc
index e8d8bb0..d6c377e 100644
--- a/src/vector_read.cc
+++ b/src/vector_read.cc
@@ -20,7 +20,7 @@ vector_read_array(FILE *f)
   v = vector_malloc(n);
 
   for (i = 0; i < v->n; ++i) {
-    fscanf(f, "%u\n", &v->data[i]);
+    fscanf(f, "%lg\n", &v->data[i]);
   }
 
   return v;
diff --git a/src/vector_write.cc b/src/vector_write.cc
index 5907f4c..03b26b2 100644
--- a/src/vector_write.cc
+++ b/src/vector_write.cc
@@ -33,7 +33,7 @@ vector_write_array(FILE *f, vector_t const *v)
   }
 
   for (i = 0; i < v->n; ++i) {
-    fprintf(f, "%d\n", v->data[i]);
+    fprintf(f, "%10.6g\n", v->data[i]);
   }
 }
 

From c9ec3f3fa777f8fe31f6edcedb316f1dab77054d Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Fri, 11 Nov 2011 16:22:40 -0700
Subject: [PATCH 35/57] + Moved all calculation code into a separate set of
 source files

---
 src/operation.cc | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/operation.h  |  3 ++
 2 files changed, 83 insertions(+)
 create mode 100644 src/operation.cc

diff --git a/src/operation.cc b/src/operation.cc
new file mode 100644
index 0000000..c6a8fd5
--- /dev/null
+++ b/src/operation.cc
@@ -0,0 +1,80 @@
+
+#include "cache.h"
+#include "compatible.h"
+#include "error.h"
+#include "matrix.h"
+#include "operation.h"
+#include "thread.h"
+#include "tensor.h"
+#include "utility.h"
+#include "vector.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+extern uint thread_count;
+
+void
+threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+{
+  debug("threaded_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  
+  compatible(vector, tensor);
+  
+  switch (tensor->strategy) {
+  case strategy::array:
+    threaded_n_mode_product_array(matrix, vector, tensor);
+    break;
+  default:
+    die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n",
+	strategy_to_string(tensor->strategy));
+    break;
+  }
+}
+
+void
+serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+{
+  debug("serial_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  
+  compatible(vector, tensor);
+  
+  switch (tensor->strategy) {
+  case strategy::array:
+    /* in this case, we want to compare the single thread version of
+       the same algo against the n-threaded version */
+    threaded_n_mode_product_array(matrix, vector, tensor);
+    break;
+  default:
+    die("serial_n_mode_product: tensor product for '%s' strategy is not currently supported.\n",
+	strategy_to_string(tensor->strategy));
+    break;
+  }
+}
+
+void
+operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
+{
+  debug("operation_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
+  
+  if (thread_count <= 1) {
+    serial_n_mode_product(matrix, vector, tensor);
+  } else {
+    threaded_n_mode_product(matrix, vector, tensor);
+  }
+}
+
+matrix_t*
+operation_n_mode_product(vector_t const *vector, tensor_t const *tensor)
+{
+  matrix_t *matrix;
+  
+  compatible(vector, tensor);
+  debug("operation_n_mode_product(vector=0x%x, tensor=0x%x)\n", vector, tensor);
+  
+  matrix = matrix_malloc(tensor->m, tensor->n, ownership::creator);
+  debug("operation_n_mode_product: matrix=0x%x\n", matrix);
+ 
+  operation_n_mode_product(matrix, vector, tensor);
+  
+  return matrix;
+}
diff --git a/src/operation.h b/src/operation.h
index 0542523..d8f6bce 100644
--- a/src/operation.h
+++ b/src/operation.h
@@ -20,6 +20,9 @@ operation::type_t string_to_operation(char const *name);
 void operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor);
 matrix_t *operation_n_mode_product(vector_t const *vector, tensor_t const *tensor);
 
+void serial_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor);
+void threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor);
+
 #endif
 
 /*

From b5f9a2f7e2f05301d8e1c1402677eb852625a93c Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Fri, 11 Nov 2011 16:23:40 -0700
Subject: [PATCH 36/57] + Darwin does not support the same affinity settings as
 Linux; I'll return to them when it seems necessary

---
 src/thread.cc | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/src/thread.cc b/src/thread.cc
index e968b14..f6bcd5d 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -71,6 +71,36 @@ thread_wait(pthread_t *thread, thread_address_t exitcode)
 /*************************************************
  * run nthreads threads in the routine start
  */
+void _thread_fork(int nthreads,
+		  thread_function_t start,
+		  thread_address_t arg,
+		  thread_address_t *exitcodes,
+		  int setaffinity)
+{
+  int i;
+  thread_argument_t *args;
+  thread_address_t *address;
+  
+  if (nthreads<1) {
+    die("thread_mutex_trylock: nthreads<1\n");
+  }
+  if ((args=(thread_argument_t *) malloc(nthreads*sizeof(thread_argument_t)))==NULL) {
+    die("thread_fork: malloc failed!\n");
+  }
+  for (i=0; i<nthreads; i++) {
+    args[i].nthreads=nthreads; args[i].myid=i; args[i].data=arg;
+  }
+  for (i=0; i<nthreads; i++) {
+    thread_create(&args[i].self,start,args+i);
+  }
+  for (i=0; i<nthreads; i++) {
+    address = (exitcodes==NULL?NULL:exitcodes+i);
+    thread_wait(&args[i].self,address);
+  }
+  free(args);
+}
+
+#if 0
 void _thread_fork(int nthreads,
 		  thread_function_t start,
 		  thread_address_t arg,
@@ -118,6 +148,7 @@ void _thread_fork(int nthreads,
   }
   free(args);
 }
+#endif
 
 /*************************************************
  * initialize a gate

From 5bcf8ca49745df248dd9bdc1ff4db0a8af5197fa Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Fri, 11 Nov 2011 16:24:27 -0700
Subject: [PATCH 37/57] + Added Darwin support for BLAS routines

---
 src/Makefile                    |  15 +++--
 src/operation_n_mode_product.cc | 115 ++++++--------------------------
 2 files changed, 31 insertions(+), 99 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 15ecac1..9c27480 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,10 +1,11 @@
-CXX=g++
-SYMBOLS=echo
 OS=`uname`
-ifeq ($(OS), Darwin)
+ifeq "$(OS)" "Darwin"
 	CXX=clang
 	CPPX11=-std=c++0x -stdlib=libc++
 	SYMBOLS=dsymutil
+else
+	CXX=g++
+	SYMBOLS=echo
 endif
 INCLUDES=-I.
 STRICT=-pedantic -Wall -Wno-variadic-macros
@@ -15,8 +16,10 @@ endif
 ifndef SIMULATE
 	EXTRA_DEBUG += -DNOSIMULATE
 endif
-EXTRA_CXXFLAGS=-c $(EXTRA_DEBUG) $(STRICT) $(INCLUDES) $(CPPX11)
-EXTRA_LDFLAGS=-Wall -thread $(EXTRA_DEBUG)
+EXTRA_CXXFLAGS=-c -DYA_BLAS -DYA_LAPACK -DYA_BLASMULT $(EXTRA_DEBUG)	\
+	$(STRICT) $(INCLUDES) $(CPPX11)
+EXTRA_LDFLAGS=-Wall -thread -lblas -llapack -framework Accelerate	\
+	$(EXTRA_DEBUG)
 
 HEADERS_CACHE=address.h cache.h hash.h
 HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h	\
@@ -31,7 +34,7 @@ HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE)	\
 
 SOURCES_CACHE=address.cc cache.cc hash.cc
 SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc		\
-	information.cc latex.cc memory.cc mmio.cc			\
+	information.cc latex.cc memory.cc mmio.cc operation.cc		\
 	operation_n_mode_product.cc operation_utility.cc random.cc	\
 	strings.cc thread.cc timer.cc tool_effectuate.cc		\
 	tool_generate.cc tool_timing.cc tool_utility.cc types.cc	\
diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index 0bbedc5..cdc009a 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -10,6 +10,11 @@
 #include "vector.h"
 #include <stdio.h>
 #include <stdlib.h>
+#ifdef __APPLE__
+#include <Accelerate/Accelerate.h>
+#else
+#include <cblas.h>
+#endif
 
 extern cache_t			 *cache;
 extern uint			 memory_stride;
@@ -52,11 +57,9 @@ thread_address_t
 tube_product(thread_argument_t *argument)
 {
   int                   t;
-  uint                  i, j, k, offset;
+  uint                  i, j, offset;
   uint                  n;
-  uint                  *P;
-  double                **M, *T;
-  double                sum;
+  double                **M, *T, *P;
   product_thread_data_t *data;
   
   data = (product_thread_data_t*) thread_data(argument);
@@ -67,14 +70,10 @@ tube_product(thread_argument_t *argument)
   T = data->tensor->values;
   
   while (-1 != (t = tube_next(data))) {
-    sum    = 0;
-    offset = t*n;
-    i      = t/n;
-    j      = t%n;
-    for (k = 0; k < n; ++k) {
-      sum += P[k] * T[offset+k];
-    }
-    M[i][j] = sum;
+    offset  = t*n;
+    i       = t/n;
+    j       = t%n;
+    M[i][j] = cblas_ddot(n, P, 1, T+offset, 1);
   }
   
   return NULL;
@@ -95,12 +94,10 @@ thread_address_t
 slice_product(thread_argument_t *argument)
 {
   int                   i;
-  uint                  j, k;
+  uint                  j;
   uint                  ioffset, joffset;
   uint                  n;
-  uint                  *P;
-  double                **M, *T;
-  double                sum[1000];
+  double                **M, *T, *P;
   product_thread_data_t *data;
   
   data = (product_thread_data_t*) thread_data(argument);
@@ -113,14 +110,8 @@ slice_product(thread_argument_t *argument)
   while (-1 != (i = slice_next(data))) {
     ioffset = i*n*n;
     for (j = 0; j < n; ++j) {
-      sum[j]  = 0;
       joffset = ioffset+j*n;
-      for (k = 0; k < n; ++k) {
-	sum[j] += P[k] * T[joffset+k];
-      }
-    }
-    for (j = 0; j < n; ++j) {
-      M[i][j] = sum[j];
+      M[i][j] = cblas_ddot(n, P, 1, T+joffset, 1);
     }
   }
   
@@ -142,8 +133,8 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   data.matrix = matrix;
   data.vector = vector;
   data.tensor = tensor;
-
-  thread_afork(thread_count, slice_product, &data, NULL);
+  
+  thread_afork(thread_count, function, &data, NULL);
 }
 
 void
@@ -170,10 +161,10 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
 void
 serial_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
-  uint   i, j, k, index;
+  uint   i, j, k;
+  uint   index;
   uint   n;
-  uint   *P;
-  double **M, *T;
+  double **M, *T, *P, sum; /* sum holds a running total of double products, so it must be a double (an integer would truncate every partial sum) */
   
   debug("n_mode_product_array(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
   
@@ -184,74 +175,12 @@ serial_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t c
   
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n; ++j) {
+      sum = 0; /* restart the accumulator for each output element M[i][j] */
       for (k = 0; k < n; ++k) {
 	index = tensor_index(tensor, i, j, k);
-	M[i][j] += P[k] * T[index];
+	sum += P[k] * T[index]; /* accumulate locally; M[i][j] is written once after the k loop */
       }
+      M[i][j] = sum; /* overwrite (rather than add to) whatever the matrix held before */
     }
   }
 }
-
-void
-threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
-{
-  debug("threaded_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
-  
-  compatible(vector, tensor);
-  
-  switch (tensor->strategy) {
-  case strategy::array:
-    threaded_n_mode_product_array(matrix, vector, tensor);
-    break;
-  default:
-    die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n",
-	strategy_to_string(tensor->strategy));
-    break;
-  }
-}
-
-void
-serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
-{
-  debug("serial_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
-  
-  compatible(vector, tensor);
-  
-  switch (tensor->strategy) {
-  case strategy::array:
-    serial_n_mode_product_array(matrix, vector, tensor);
-    break;
-  default:
-    die("serial_n_mode_product: tensor product for '%s' strategy is not currently supported.\n",
-	strategy_to_string(tensor->strategy));
-    break;
-  }
-}
-
-void
-operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
-{
-  debug("operation_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor);
-  
-  if (thread_count <= 1) {
-    serial_n_mode_product(matrix, vector, tensor);
-  } else {
-    threaded_n_mode_product(matrix, vector, tensor);
-  }
-}
-
-matrix_t*
-operation_n_mode_product(vector_t const *vector, tensor_t const *tensor)
-{
-  matrix_t *matrix;
-  
-  compatible(vector, tensor);
-  debug("operation_n_mode_product(vector=0x%x, tensor=0x%x)\n", vector, tensor);
-  
-  matrix = matrix_malloc(tensor->m, tensor->n, ownership::creator);
-  debug("operation_n_mode_product: matrix=0x%x\n", matrix);
- 
-  operation_n_mode_product(matrix, vector, tensor);
-  
-  return matrix;
-}

From c499f44d0f1c840e039ffde44c46aa9a1392b2cb Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Fri, 11 Nov 2011 18:26:56 -0700
Subject: [PATCH 38/57] + Split lin-alg routines off into a new set of files,
 to test other libraries against BLAS and hand-tuned code

---
 src/Makefile                    | 20 ++++++++++++--------
 src/algebra.cc                  | 15 +++++++++++++++
 src/algebra.h                   | 13 +++++++++++++
 src/operation_n_mode_product.cc | 10 +++-------
 4 files changed, 43 insertions(+), 15 deletions(-)
 create mode 100644 src/algebra.cc
 create mode 100644 src/algebra.h

diff --git a/src/Makefile b/src/Makefile
index 9c27480..d47ab7f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,6 +9,7 @@ else
 endif
 INCLUDES=-I.
 STRICT=-pedantic -Wall -Wno-variadic-macros
+
 EXTRA_DEBUG=-g
 ifndef DEBUG
 	EXTRA_DEBUG += -DNODEBUG
@@ -16,15 +17,18 @@ endif
 ifndef SIMULATE
 	EXTRA_DEBUG += -DNOSIMULATE
 endif
+
+EXTRA_LDFLAGS=-Wall -thread -lblas -llapack $(EXTRA_DEBUG)
 EXTRA_CXXFLAGS=-c -DYA_BLAS -DYA_LAPACK -DYA_BLASMULT $(EXTRA_DEBUG)	\
-	$(STRICT) $(INCLUDES) $(CPPX11)
-EXTRA_LDFLAGS=-Wall -thread -lblas -llapack -framework Accelerate	\
-	$(EXTRA_DEBUG)
+		$(STRICT) $(INCLUDES) $(CPPX11)
+ifeq "$(OS)" "Darwin"
+	EXTRA_LDFLAGS += -framework Accelerate
+endif
 
 HEADERS_CACHE=address.h cache.h hash.h
 HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h	\
-	memory.h operation.h random.h thread.h strings.h timer.h	\
-	tool.h utility.h compatible.h
+	algebra.h memory.h operation.h random.h thread.h strings.h	\
+	timer.h tool.h utility.h compatible.h
 HEADERS_GENERATE=generate.h
 HEADERS_MATRIX=matrix.h mmio.h
 HEADERS_TENSOR=tensor.h
@@ -34,9 +38,9 @@ HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE)	\
 
 SOURCES_CACHE=address.cc cache.cc hash.cc
 SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc		\
-	information.cc latex.cc memory.cc mmio.cc operation.cc		\
-	operation_n_mode_product.cc operation_utility.cc random.cc	\
-	strings.cc thread.cc timer.cc tool_effectuate.cc		\
+	information.cc latex.cc algebra.cc memory.cc mmio.cc		\
+	operation.cc operation_n_mode_product.cc operation_utility.cc	\
+	random.cc strings.cc thread.cc timer.cc tool_effectuate.cc	\
 	tool_generate.cc tool_timing.cc tool_utility.cc types.cc	\
 	utility.cc
 SOURCES_GENERATE=generate_tensor_from_matrix.cc
diff --git a/src/algebra.cc b/src/algebra.cc
new file mode 100644
index 0000000..d3a98d5
--- /dev/null
+++ b/src/algebra.cc
@@ -0,0 +1,15 @@
+
+#include "algebra.h"
+#include "error.h"
+
+#ifdef __APPLE__
+#include <Accelerate/Accelerate.h>
+#else
+#include <cblas.h>
+#endif
+
+double /* returns the value produced by cblas_ddot, unchanged */
+array_inner_product(const int N, const double *X, const int incX, const double *Y, const int incY)
+{
+  return cblas_ddot(N, X, incX, Y, incY); /* thin wrapper: forwards all five arguments so callers only need algebra.h, not the BLAS header */
+}
diff --git a/src/algebra.h b/src/algebra.h
new file mode 100644
index 0000000..4a8e750
--- /dev/null
+++ b/src/algebra.h
@@ -0,0 +1,13 @@
+
+#ifndef _ARRAY_MATH_H_
+#define _ARRAY_MATH_H_
+
+double array_inner_product(const int N, const double *X, const int incX, const double *Y, const int incY);
+
+#endif
+
+/*
+  Local Variables:
+  mode: C++
+  End:
+*/
diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index cdc009a..8fbab24 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -1,4 +1,5 @@
 
+#include "algebra.h"
 #include "cache.h"
 #include "compatible.h"
 #include "error.h"
@@ -10,11 +11,6 @@
 #include "vector.h"
 #include <stdio.h>
 #include <stdlib.h>
-#ifdef __APPLE__
-#include <Accelerate/Accelerate.h>
-#else
-#include <cblas.h>
-#endif
 
 extern cache_t			 *cache;
 extern uint			 memory_stride;
@@ -73,7 +69,7 @@ tube_product(thread_argument_t *argument)
     offset  = t*n;
     i       = t/n;
     j       = t%n;
-    M[i][j] = cblas_ddot(n, P, 1, T+offset, 1);
+    M[i][j] = array_inner_product(n, P, 1, T+offset, 1);
   }
   
   return NULL;
@@ -111,7 +107,7 @@ slice_product(thread_argument_t *argument)
     ioffset = i*n*n;
     for (j = 0; j < n; ++j) {
       joffset = ioffset+j*n;
-      M[i][j] = cblas_ddot(n, P, 1, T+joffset, 1);
+      M[i][j] = array_inner_product(n, P, 1, T+joffset, 1);
     }
   }
   

From 914cb463f760207846484dcaa766ffddbe000ac5 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 06:22:19 -0700
Subject: [PATCH 39/57] + Split support for Darwin and Linux

---
 src/algebra.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/algebra.cc b/src/algebra.cc
index d3a98d5..242db80 100644
--- a/src/algebra.cc
+++ b/src/algebra.cc
@@ -4,7 +4,9 @@
 
 #ifdef __APPLE__
 #include <Accelerate/Accelerate.h>
-#else
+#endif
+
+#ifdef __linux__
 #include <cblas.h>
 #endif
 

From b22ae63a60c5929b11f34fb344831862e248047b Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 06:24:24 -0700
Subject: [PATCH 40/57] + Added left/right operand association split

---
 src/main.cc                     |   1 +
 src/operation_n_mode_product.cc | 103 +++++++++++++++++++++++---------
 src/tool_effectuate.cc          |  20 +++++--
 3 files changed, 92 insertions(+), 32 deletions(-)

diff --git a/src/main.cc b/src/main.cc
index 922276c..5e7dd16 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -18,6 +18,7 @@
 #include <ctype.h>
 #include <unistd.h>
 
+association::type_t       operand_association;
 cache_t                   *cache;
 uint                      cache_size;
 uint                      cache_line_size;
diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index 8fbab24..3d259e3 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+extern association::type_t       operand_association;
 extern cache_t			 *cache;
 extern uint			 memory_stride;
 extern uint			 thread_count;
@@ -29,6 +30,18 @@ extern thread::partition::type_t thread_partition;
       end for
     end for
   end for
+  
+  Computing ($Tp$):
+  Let $\T \in R^{n\times n\times n}$ be a tensor.
+  Let $\M \in R^{n\times n}$ be a matrix.
+  Let $p \in R^{n}$ be a vector.
+  for i = 1 to l do
+    for j = 1 to m do 
+      for k = 1 to m do
+        M[i][j] += T[j][i][k] * p[k]
+      end for
+    end for
+  end for
 */
 
 typedef struct {
@@ -49,21 +62,25 @@ tube_next(product_thread_data_t *data)
   return k < (data->tensor->n*data->tensor->n) ? k : -1;
 }
 
-thread_address_t
-tube_product(thread_argument_t *argument)
+void
+tube_product_pT(product_thread_data_t *data, uint n, double **M, double *P, double *T)
 {
-  int                   t;
-  uint                  i, j, offset;
-  uint                  n;
-  double                **M, *T, *P;
-  product_thread_data_t *data;
-  
-  data = (product_thread_data_t*) thread_data(argument);
+  int  t;
+  uint i, j, offset;
   
-  n = data->tensor->n;
-  M = data->matrix->data;
-  P = data->vector->data;
-  T = data->tensor->values;
+  while (-1 != (t = tube_next(data))) {
+    offset  = t*n;
+    i       = t/n;
+    j       = t%n;
+    M[i][j] = array_inner_product(n, P, 1, T+offset, 1);
+  }
+}
+
+void
+tube_product_Tp(product_thread_data_t *data, uint n, double **M, double *P, double *T)
+{
+  int  t;
+  uint i, j, offset;
   
   while (-1 != (t = tube_next(data))) {
     offset  = t*n;
@@ -71,6 +88,20 @@ tube_product(thread_argument_t *argument)
     j       = t%n;
     M[i][j] = array_inner_product(n, P, 1, T+offset, 1);
   }
+}
+
+thread_address_t
+tube_product(thread_argument_t *argument)
+{
+  product_thread_data_t *data;
+  
+  data = (product_thread_data_t*) thread_data(argument);
+  
+  if (association::left == operand_association) {
+    tube_product_pT(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
+  } else {
+    tube_product_Tp(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
+  }
   
   return NULL;
 }
@@ -86,22 +117,26 @@ slice_next(product_thread_data_t *data)
   return k < data->tensor->n ? k : -1;
 }
 
-thread_address_t
-slice_product(thread_argument_t *argument)
+void
+slice_product_pT(product_thread_data_t *data, uint n, double **M, double *P, double *T)
 {
-  int                   i;
-  uint                  j;
-  uint                  ioffset, joffset;
-  uint                  n;
-  double                **M, *T, *P;
-  product_thread_data_t *data;
+  int  i;
+  uint j, ioffset, joffset;
   
-  data = (product_thread_data_t*) thread_data(argument);
-  
-  n = data->tensor->n;
-  M = data->matrix->data;
-  P = data->vector->data;
-  T = data->tensor->values;
+  while (-1 != (i = slice_next(data))) {
+    ioffset = i*n*n;
+    for (j = 0; j < n; ++j) {
+      joffset = ioffset+j*n;
+      M[i][j] = array_inner_product(n, P, 1, T+joffset, 1);
+    }
+  }
+}
+
+void
+slice_product_Tp(product_thread_data_t *data, uint n, double **M, double *P, double *T)
+{
+  int  i;
+  uint j, ioffset, joffset;
   
   while (-1 != (i = slice_next(data))) {
     ioffset = i*n*n;
@@ -110,6 +145,20 @@ slice_product(thread_argument_t *argument)
       M[i][j] = array_inner_product(n, P, 1, T+joffset, 1);
     }
   }
+}
+
+thread_address_t
+slice_product(thread_argument_t *argument)
+{
+  product_thread_data_t *data;
+  
+  data = (product_thread_data_t*) thread_data(argument);
+  
+  if (association::left == operand_association) {
+    slice_product_pT(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
+  } else {
+    slice_product_Tp(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
+  }
   
   return NULL;
 }
diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index 9119197..6db2945 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -19,6 +19,7 @@
 #include <ctype.h>
 #include <unistd.h>
 
+extern association::type_t       operand_association;
 extern cache_t			 *cache;
 extern uint			 cache_size;
 extern uint			 cache_line_size;
@@ -182,17 +183,25 @@ effectuate_tool_main(int argc, char *argv[])
   int c;
   
   /* set the program's defaults */
-  memory_stride    = DEFAULT_MEMORY_STRIDE;
-  optcode          = DEFAULT_OPERATION;
-  thread_count     = DEFAULT_THREAD_COUNT;
-  thread_partition = DEFAULT_THREAD_PARTITION;
+  operand_association = DEFAULT_ASSOCIATION;
+  memory_stride       = DEFAULT_MEMORY_STRIDE;
+  optcode             = DEFAULT_OPERATION;
+  thread_count        = DEFAULT_THREAD_COUNT;
+  thread_partition    = DEFAULT_THREAD_PARTITION;
   
   /* we will privide our own error messages */
   opterr = 0;
   
   /* extract any command-line options the user provided */
-  while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:p:r:st:TuvV:w"))) {
+  while (-1 != (c = getopt(argc, argv, ":a:hl:m:n:o:p:r:st:TuvV:w"))) {
     switch (c) {
+    case 'a':
+      if (isdigit(optarg[0])) {
+	operand_association = (association::type_t) atoi(optarg);
+      } else {
+	operand_association = string_to_association(optarg);
+      }
+      break;
     case 'h': 
       effectuate_tool_usage();
       break;
@@ -284,6 +293,7 @@ effectuate_tool_main(int argc, char *argv[])
   
   /* print program options, for debugging purposes */
   print_tool_options();
+  debug("effectuate_tool_main: operand_association='%s'\n", association_to_string(operand_association));
   debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode));
   debug("effectuate_tool_main: memory_stride=%d\n", memory_stride);
   debug("effectuate_tool_main: thread_count=%d\n", thread_count);

From 08f113b2819f4e175d8f68755e0b95df1c2f1562 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 06:25:02 -0700
Subject: [PATCH 41/57] + Added Octave/Matlab output test

---
 src/matrix_write.cc | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/matrix_write.cc b/src/matrix_write.cc
index d8cc7f2..f1b2fa1 100644
--- a/src/matrix_write.cc
+++ b/src/matrix_write.cc
@@ -16,6 +16,31 @@ matrix_initialize_type(MM_typecode *type)
   mm_set_real(type);
 }
 
+#if 0
+/* Matlab/Octave format: print the N-by-M matrix A to stdout as "[ [a, b]; [c, d] ]" (compiled out by the surrounding #if 0) */
+void printmat(int N, int M, double *A, int LDA)
+{
+    int i, j;
+    double mtmp;
+
+    printf("[ ");
+    for (i = 0; i < N; i++) {
+	printf("[ ");
+	for (j = 0; j < M; j++) {
+	    mtmp = A[i + j * LDA]; /* element (i, j): neighbouring columns are LDA entries apart in A */
+	    printf("%5.2e", mtmp);
+	    if (j < M - 1)
+		printf(", "); /* separator between columns, none after the last one */
+	}
+	if (i < N - 1)
+	    printf("]; "); /* ';' separates rows */
+	else
+	    printf("] ");
+    }
+    printf("]");
+}
+#endif
+
 void
 matrix_fwrite_array(FILE *file, matrix_t const *matrix)
 {

From afeb7df3b9c62262a1f1b10274f98a05917cf100 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 06:25:52 -0700
Subject: [PATCH 42/57] + Added default operand association

---
 src/tool.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/tool.h b/src/tool.h
index ba641a8..33eeb7c 100644
--- a/src/tool.h
+++ b/src/tool.h
@@ -20,6 +20,7 @@ namespace tool {
 #define OPTION_MESSAGE(x,a,b)   (x ? a:b)
 #define DEFAULT_ON_OR_OFF(x)    OPTION_MESSAGE(x, "on", "off")
 
+#define DEFAULT_ASSOCIATION           association::left
 #define DEFAULT_HUMAN_READABLE        true
 #define DEFAULT_ITERATIONS            1
 #define DEFAULT_MEMORY_STRIDE         32

From d8a49fe87d89d1a90c2210a9bbce8491425d1dbf Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 06:27:25 -0700
Subject: [PATCH 43/57] + Conversion between operand association string and
 enum

---
 src/operation.h          | 10 ++++++++++
 src/operation_utility.cc | 26 ++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/src/operation.h b/src/operation.h
index d8f6bce..8576ebb 100644
--- a/src/operation.h
+++ b/src/operation.h
@@ -13,8 +13,18 @@ namespace operation {
   } type_t;
 }
 
+namespace association {
+  typedef enum {
+    unknown,
+    left,
+    right
+  } type_t;
+}
+
+char const* association_to_string(association::type_t association);
 char const* operation_to_string(operation::type_t operation);
 char const* operation_to_description_string(operation::type_t operation);
+association::type_t string_to_association(char const *name);
 operation::type_t string_to_operation(char const *name);
 
 void operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor);
diff --git a/src/operation_utility.cc b/src/operation_utility.cc
index e865afc..2c7dd4d 100644
--- a/src/operation_utility.cc
+++ b/src/operation_utility.cc
@@ -60,3 +60,29 @@ print_operations_with_descriptions(char const *format)
     message(format, map_operations_to_string[i], map_operations_to_description[i]);
   }
 }
+
+static char const *map_associations_to_string[] = { 
+  "unknown",
+  "left",
+  "right"
+};
+
+char const*
+association_to_string(association::type_t association)
+{
+  return map_associations_to_string[(uint) association < COUNT_OF(map_associations_to_string) ? association : 0]; /* values outside the table (e.g. from a numeric -a option) map to entry 0, "unknown", instead of reading past the array */
+}
+
+association::type_t
+string_to_association(char const *name)
+{
+  uint i;
+  
+  for (i = 0; i < COUNT_OF(map_associations_to_string); ++i) { /* linear scan of the name table; the matching index is used as the enum value */
+    if (0 == strcmp(name, map_associations_to_string[i])) { /* exact, case-sensitive comparison */
+      return (association::type_t) i;
+    }
+  }
+  
+  return association::unknown; /* no table entry matched the given name */
+}

From 06b41c89e8fcf5fd484b99a90581b358ec176bbe Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 10:58:30 -0700
Subject: [PATCH 44/57] + Added UI support for tensor storage orientation and
 storage strategy specification

---
 src/main.cc            |   2 +
 src/thread.cc          |  26 +++++++
 src/thread.h           |   2 +
 src/tool.h             |   2 +-
 src/tool_convert.cc    | 169 +++++++++++++++++++++++++++++++++++++++++
 src/tool_effectuate.cc |  35 +++++++--
 6 files changed, 230 insertions(+), 6 deletions(-)
 create mode 100644 src/tool_convert.cc

diff --git a/src/main.cc b/src/main.cc
index 5e7dd16..9fc3efa 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -26,6 +26,8 @@ uint                      iterations;
 uint                      memory_stride;
 thread::partition::type_t thread_partition;
 uint                      seed;
+orientation::type_t       storage_orientation;
+strategy::type_t          storage_strategy;
 uint                      thread_count;
 char                      *tool_name;
 tool::type_t              tool_type;
diff --git a/src/thread.cc b/src/thread.cc
index f6bcd5d..186e844 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -16,6 +16,12 @@ static char const *map_thread_partition_to_string[] = {
   "slice"
 };
 
+static char const *map_thread_partition_to_description[] = { 
+  "unknown",
+  "tube per thread",
+  "slice per thread"
+};
+
 char const*
 thread_partition_to_string(thread::partition::type_t partition)
 {
@@ -36,6 +42,26 @@ string_to_thread_partition(char const *name)
   return thread::partition::unknown;
 }
 
+void
+print_thread_partitions(char const *format)
+{
+  uint i;
+  
+  for (i = 1; i < COUNT_OF(map_thread_partition_to_string); ++i) { /* starts at 1, so entry 0 is never printed (presumably the "unknown" placeholder -- confirm against the table) */
+    message(format, map_thread_partition_to_string[i]); /* format receives one string argument: the partition name */
+  }
+}
+
+void
+print_thread_partitions_with_descriptions(char const *format)
+{
+  uint i;
+  
+  for (i = 1; i < COUNT_OF(map_thread_partition_to_string); ++i) { /* starts at 1 to skip entry 0 ("unknown" in the description table); the bound comes from the name table, so both tables must have the same length */
+    message(format, map_thread_partition_to_string[i], map_thread_partition_to_description[i]); /* format receives two string arguments: name, then description */
+  }
+}
+
 /*************************************************
  * attempt to lock a mutex
  */
diff --git a/src/thread.h b/src/thread.h
index 685cd9d..ace57b2 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -29,6 +29,8 @@ namespace thread {
 
 char const* thread_partition_to_string(thread::partition::type_t partition);
 thread::partition::type_t string_to_thread_partition(char const *name);
+void print_thread_partitions(char const *format);
+void print_thread_partitions_with_descriptions(char const *format);
 
 /* Linux defs:
  *   _REENTRANT to get thread-safe libs
diff --git a/src/tool.h b/src/tool.h
index 33eeb7c..9946bbe 100644
--- a/src/tool.h
+++ b/src/tool.h
@@ -27,7 +27,7 @@ namespace tool {
 #define DEFAULT_OPERATION             operation::n_mode_product
 #define DEFAULT_ORIENTATION           orientation::row
 #define DEFAULT_SIMULATE              false
-#define DEFAULT_STRATEGY              strategy::compressed
+#define DEFAULT_STRATEGY              strategy::array
 #define DEFAULT_TRACING               false
 #define DEFAULT_THREAD_COUNT          1
 #define DEFAULT_THREAD_PARTITION      thread::partition::tube
diff --git a/src/tool_convert.cc b/src/tool_convert.cc
new file mode 100644
index 0000000..592fcd7
--- /dev/null
+++ b/src/tool_convert.cc
@@ -0,0 +1,169 @@
+
+#include "cache.h"
+#include "compatible.h"
+#include "error.h"
+#include "file.h"
+#include "matrix.h"
+#include "operation.h"
+#include "tensor.h"
+#include "tool.h"
+#include "utility.h"
+#include "vector.h"
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <ctype.h>
+#include <unistd.h>
+
+extern cache_t           *cache;
+extern uint              cache_size;
+extern uint              cache_line_size;
+extern bool              emit_latex;
+extern uint              iterations;
+extern char              *tool_name;
+extern tool::type_t      tool_type;
+extern bool              simulate;
+extern bool              verbose;
+extern verbosity::type_t noisiness;
+extern bool              write_results;
+
+void
+convert_tool_usage() 
+{ /* print the convert tool's help text (banner, usage line, options, example session) and terminate */
+  print_tool_banner();
+  message("\nUsage:\n");
+  message("\t%s [options] <input> [output]\n", tool_name);
+  message("\nOptions:\n");
+  message("\t-h\tthis screen\n");
+  message("\t-l\temit LaTeX code as output (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_EMIT_LATEX));
+  message("\t-s\tstrategy (default: %s)\n", strategy_to_string(DEFAULT_STRATEGY));
+  print_strategies("\t\t- %s\n");
+  message("\t-o\torientation (default: %s)\n", orientation_to_string(DEFAULT_ORIENTATION));
+  print_orientations("\t\t- %s\n");
+  message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE));
+  message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max);
+  message("\nExample:\n\n");
+  message("\t$ ./tensor %s -s compressed -o column ieee-fig4.in tensor.out\n", tool_name);
+  message("\tReading ieee-fig4.in ... done [0.000305]\n");
+  message("\tConverting from 'coordinate' to 'compressed-column' ... done [0.000010]\n");
+  message("\tWriting tensor.out ... done [0.000031]\n");
+  exit(1); /* never returns: the process always ends with status 1 here, including when reached via -h */
+}
+
+tensor_t*
+timed_tensor_convert(tensor_t *source, strategy::type_t strategy, orientation::type_t orientation)
+{ /* run tensor_convert() on source, reporting progress and elapsed time; returns tensor_convert()'s result */
+  precision_timer_t  t;
+  tensor_t *tensor;
+  
+  progress("Converting from '%s' to '%s-%s' ... ",
+	  strategy_to_string(source->strategy),
+	  strategy_to_string(strategy),
+	  orientation_to_string(orientation));
+  timer_start(&t); /* the timer brackets only the tensor_convert() call, not the progress message */
+  tensor = tensor_convert(source, strategy, orientation);
+  timer_end(&t);
+  print_elapsed_time(t);
+  
+  return tensor;
+}
+
+void
+convert_tool_main(int argc, char *argv[])
+{ /* entry point of the convert tool: parse options, read the input tensor, convert it when the requested strategy differs, then hand the result to timed_tensor_write() */
+  int                 c, offset;
+  char                *name;
+  tensor_t            *tensor, *result;
+  strategy::type_t    strategy;
+  orientation::type_t orientation;
+  
+  /* just to be safe, set the tensors to null */
+  tensor = result = NULL;
+  
+  /* set the program's defaults */
+  orientation = DEFAULT_ORIENTATION;
+  strategy    = DEFAULT_STRATEGY;
+  
+  /* we will provide our own error messages */
+  opterr = 0;
+  
+  /* extract any command-line options the user provided */
+  while (-1 != (c = getopt(argc, argv, ":hlo:s:vV:"))) {
+    switch (c) {
+    case 'h': 
+      convert_tool_usage();
+      break;
+    case 'l':
+      emit_latex = !emit_latex;
+      break;
+    case 'o': /* orientation: numeric enum value or name */
+      if (isdigit((unsigned char) optarg[0])) { /* isdigit() is undefined for negative char values, hence the cast */
+	orientation = (orientation::type_t) atoi(optarg);
+      } else {
+	orientation = string_to_orientation(optarg); 
+      }
+      break;
+    case 's': /* strategy: numeric enum value or name */
+      if (isdigit((unsigned char) optarg[0])) { /* same cast as for -o */
+	strategy = (strategy::type_t) atoi(optarg);
+      } else {
+	strategy = string_to_strategy(optarg);
+      }
+      break;
+    case 'v': 
+      verbose = !verbose;
+      break;
+    case 'V':
+      noisiness = (verbosity::type_t) atoi(optarg);
+      if (0 == noisiness) { /* atoi() yields 0 for non-numeric input; fall back to the default level */
+	noisiness = DEFAULT_VERBOSITY;
+      }
+      break;
+    case ':': /* reported by getopt because the option string starts with ':' */
+      die("Option -%c requires an operand; that is, an integer or string value.\n", optopt);
+      break;
+    case '?':
+      die("Unknown option: `-%c'\n", optopt);
+      break;
+    default:
+      abort();
+      break;
+    }
+  }
+  
+  if (noisiness > DEFAULT_VERBOSITY) {
+    verbose = true;
+  }
+  
+  /* count the number of remaining arguments */
+  if (argc-optind < 1) {
+    convert_tool_usage();
+  }
+  
+  /* print program options, for debugging purposes */
+  print_tool_options();
+  debug("convert_tool_main: orientation='%s'\n", orientation_to_string(orientation));
+  debug("convert_tool_main: strategy='%s'\n", strategy_to_string(strategy));
+  
+  /* parse the remaining command line options */
+  offset = optind;
+  name   = argv[offset++];
+  tensor = timed_tensor_read(name);
+  debug("main: tensor=0x%x\n", tensor);
+  
+  if (strategy == tensor->strategy) {
+    /* we'll deal with differing orientation when it comes up */
+    result = tensor;
+    tensor = NULL; /* ownership moved to result, so the tensor is not freed twice below */
+  } else {
+    result = timed_tensor_convert(tensor, strategy, orientation);
+  }
+  debug("main: result=0x%x\n", result);
+  
+  timed_tensor_write(argc, argv, offset, result);
+  
+  tensor_free(result);
+  tensor_free(tensor); /* may be NULL here; assumes tensor_free() tolerates NULL -- TODO confirm */
+}
diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index 6db2945..ad0ce22 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -26,6 +26,8 @@ extern uint			 cache_line_size;
 extern bool			 human_readable;
 extern uint			 iterations;
 extern uint			 memory_stride;
+extern orientation::type_t       storage_orientation;
+extern strategy::type_t          storage_strategy;
 extern uint			 thread_count;
 extern thread::partition::type_t thread_partition;
 extern char			 *tool_name;
@@ -53,20 +55,25 @@ effectuate_tool_usage()
   message("\t-n\tnumber of times to apply operation (default: %d)\n", DEFAULT_ITERATIONS);
   message("\t-o\toperation (default: %s)\n", operation_to_string(DEFAULT_OPERATION));
   print_operations_with_descriptions("\t\t- %s : %s\n");
+  message("\t-O\torientation (default: %s)\n", orientation_to_string(DEFAULT_ORIENTATION));
+  print_orientations("\t\t- %s\n");
 #if !defined (NOSIMULATE)
   message("\t-s\tsimulate cache (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_SIMULATE));
 #endif
-  message("\t-t\tnumer of thread_count to use (default: %d)\n", DEFAULT_THREAD_COUNT);
+  message("\t-S\tstrategy (default: %s)\n", strategy_to_string(DEFAULT_STRATEGY));
+  print_strategies("\t\t- %s\n");
+  message("\t-p\tpartition scheme for work (default: %s)\n", thread_partition_to_string(DEFAULT_THREAD_PARTITION));
+  print_thread_partitions_with_descriptions("\t\t- %s : %s\n");
+  message("\t-t\tnumber of threads to use for operation (default: %d)\n", DEFAULT_THREAD_COUNT);
   message("\t-T\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING));
   message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE));
   message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max);
   message("\t-w\twrite results (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_WRITE_RESULTS));
   message("\nExample:\n\n");
-  message("\t$ ./tensor %s -o n-mode vector.in tensor.in matrix.out\n", tool_name);
+  message("\t$ ./tensor %s -o n-mode vector100.in dense100.in\n", tool_name);
   message("\tReading vector.in ... done [0.000305]\n");
   message("\tReading tensor.in ... done [0.000235]\n");
-  message("\tConverting from 'coordinate' to 'compressed-column' ... done [0.000010]\n");
-  message("\tWriting matrix.out ... done [0.000031]\n");
+  message("\tPerforming operation 'dense tensor \\times vector product' ... done [3.736000]\n"); /* "\\times" keeps the backslash literal (a bare "\t" here is a tab escape); '\n' ends the line like every other example line */
   exit(1);
 }
 
@@ -186,6 +193,8 @@ effectuate_tool_main(int argc, char *argv[])
   operand_association = DEFAULT_ASSOCIATION;
   memory_stride       = DEFAULT_MEMORY_STRIDE;
   optcode             = DEFAULT_OPERATION;
+  storage_orientation = DEFAULT_ORIENTATION;
+  storage_strategy    = DEFAULT_STRATEGY;
   thread_count        = DEFAULT_THREAD_COUNT;
   thread_partition    = DEFAULT_THREAD_PARTITION;
   
@@ -193,7 +202,7 @@ effectuate_tool_main(int argc, char *argv[])
   opterr = 0;
   
   /* extract any command-line options the user provided */
-  while (-1 != (c = getopt(argc, argv, ":a:hl:m:n:o:p:r:st:TuvV:w"))) {
+  while (-1 != (c = getopt(argc, argv, ":a:hl:m:n:o:O:p:r:sS:t:TuvV:w"))) {
     switch (c) {
     case 'a':
       if (isdigit(optarg[0])) {
@@ -230,6 +239,13 @@ effectuate_tool_main(int argc, char *argv[])
 	optcode = string_to_operation(optarg);
       }
       break;
+    case 'O':
+      if (isdigit(optarg[0])) {
+	storage_orientation = (orientation::type_t) atoi(optarg);
+      } else {
+	storage_orientation = string_to_orientation(optarg); 
+      }
+      break;
     case 'p':
       if (isdigit(optarg[0])) {
 	thread_partition = (thread::partition::type_t) atoi(optarg);
@@ -246,6 +262,13 @@ effectuate_tool_main(int argc, char *argv[])
     case 's':
       simulate = !simulate;
       break;
+    case 'S':
+      if (isdigit(optarg[0])) {
+	storage_strategy = (strategy::type_t) atoi(optarg);
+      } else {
+	storage_strategy = string_to_strategy(optarg);
+      }
+      break;
     case 't':
       thread_count = atoi(optarg);
       if (0 == thread_count) {
@@ -296,6 +319,8 @@ effectuate_tool_main(int argc, char *argv[])
   debug("effectuate_tool_main: operand_association='%s'\n", association_to_string(operand_association));
   debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode));
   debug("effectuate_tool_main: memory_stride=%d\n", memory_stride);
+  debug("effectuate_tool_main: storage_orientation='%s'\n", orientation_to_string(storage_orientation));
+  debug("effectuate_tool_main: storage_strategy='%s'\n", strategy_to_string(storage_strategy));
   debug("effectuate_tool_main: thread_count=%d\n", thread_count);
   debug("effectuate_tool_main: thread_partition='%s'\n", thread_partition_to_string(thread_partition));
   

From 757a983a8504ad74e7c261f8c03f46c40c659a4f Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 10:59:07 -0700
Subject: [PATCH 45/57] + We are working with DENSE tensors, so reflect this in
 our output messages

---
 src/operation_utility.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operation_utility.cc b/src/operation_utility.cc
index 2c7dd4d..f95c3c0 100644
--- a/src/operation_utility.cc
+++ b/src/operation_utility.cc
@@ -12,7 +12,7 @@ static char const *map_operations_to_string[] = {
 
 static char const *map_operations_to_description[] = { 
   "unknown",
-  "dense vector \\times sparse tensor product"
+  "dense tensor \\times vector product"
 };
 
 char const*

From 025d0dab7f929c0fe906a3ee616257136aaec45a Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 11:01:42 -0700
Subject: [PATCH 46/57] + All operations are now of the form tensor operation
 operand

---
 src/main.cc            |  1 -
 src/operation.h        | 10 ----------
 src/tool.h             |  1 -
 src/tool_effectuate.cc | 10 +---------
 4 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/src/main.cc b/src/main.cc
index 9fc3efa..ee257e2 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -18,7 +18,6 @@
 #include <ctype.h>
 #include <unistd.h>
 
-association::type_t       operand_association;
 cache_t                   *cache;
 uint                      cache_size;
 uint                      cache_line_size;
diff --git a/src/operation.h b/src/operation.h
index 8576ebb..d8f6bce 100644
--- a/src/operation.h
+++ b/src/operation.h
@@ -13,18 +13,8 @@ namespace operation {
   } type_t;
 }
 
-namespace association {
-  typedef enum {
-    unknown,
-    left,
-    right
-  } type_t;
-}
-
-char const* association_to_string(association::type_t association);
 char const* operation_to_string(operation::type_t operation);
 char const* operation_to_description_string(operation::type_t operation);
-association::type_t string_to_association(char const *name);
 operation::type_t string_to_operation(char const *name);
 
 void operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor);
diff --git a/src/tool.h b/src/tool.h
index 9946bbe..d913336 100644
--- a/src/tool.h
+++ b/src/tool.h
@@ -20,7 +20,6 @@ namespace tool {
 #define OPTION_MESSAGE(x,a,b)   (x ? a:b)
 #define DEFAULT_ON_OR_OFF(x)    OPTION_MESSAGE(x, "on", "off")
 
-#define DEFAULT_ASSOCIATION           association::left
 #define DEFAULT_HUMAN_READABLE        true
 #define DEFAULT_ITERATIONS            1
 #define DEFAULT_MEMORY_STRIDE         32
diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index ad0ce22..f01c129 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -190,7 +190,6 @@ effectuate_tool_main(int argc, char *argv[])
   int c;
   
   /* set the program's defaults */
-  operand_association = DEFAULT_ASSOCIATION;
   memory_stride       = DEFAULT_MEMORY_STRIDE;
   optcode             = DEFAULT_OPERATION;
   storage_orientation = DEFAULT_ORIENTATION;
@@ -202,15 +201,8 @@ effectuate_tool_main(int argc, char *argv[])
   opterr = 0;
   
   /* extract any command-line options the user provided */
-  while (-1 != (c = getopt(argc, argv, ":a:hl:m:n:o:O:p:r:sS:t:TuvV:w"))) {
+  while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:O:p:r:sS:t:TuvV:w"))) {
     switch (c) {
-    case 'a':
-      if (isdigit(optarg[0])) {
-	operand_association = (association::type_t) atoi(optarg);
-      } else {
-	operand_association = string_to_association(optarg);
-      }
-      break;
     case 'h': 
       effectuate_tool_usage();
       break;

From 038d3de0e8809acb210c2a3b4bdbaae01f2f8de4 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 11:02:15 -0700
Subject: [PATCH 47/57] + All operations are now of the form tensor operation
 operand

---
 src/tool_effectuate.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index f01c129..e857674 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -19,7 +19,6 @@
 #include <ctype.h>
 #include <unistd.h>
 
-extern association::type_t       operand_association;
 extern cache_t			 *cache;
 extern uint			 cache_size;
 extern uint			 cache_line_size;

From bf75d179b16991955d72c1d3a1f5cf1f16efaa76 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 11:03:42 -0700
Subject: [PATCH 48/57] + All operations are now of the form tensor operation
 operand

---
 src/operation_utility.cc | 26 --------------------------
 src/tool_effectuate.cc   |  1 -
 2 files changed, 27 deletions(-)

diff --git a/src/operation_utility.cc b/src/operation_utility.cc
index f95c3c0..e5a9b83 100644
--- a/src/operation_utility.cc
+++ b/src/operation_utility.cc
@@ -60,29 +60,3 @@ print_operations_with_descriptions(char const *format)
     message(format, map_operations_to_string[i], map_operations_to_description[i]);
   }
 }
-
-static char const *map_associations_to_string[] = { 
-  "unknown",
-  "left",
-  "right"
-};
-
-char const*
-association_to_string(association::type_t association)
-{
-  return map_associations_to_string[association];
-}
-
-association::type_t
-string_to_association(char const *name)
-{
-  uint i;
-  
-  for (i = 0; i < COUNT_OF(map_associations_to_string); ++i) {
-    if (0 == strcmp(name, map_associations_to_string[i])) {
-      return (association::type_t) i;
-    }
-  }
-  
-  return association::unknown;
-}
diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index e857674..92f788d 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -307,7 +307,6 @@ effectuate_tool_main(int argc, char *argv[])
   
   /* print program options, for debugging purposes */
   print_tool_options();
-  debug("effectuate_tool_main: operand_association='%s'\n", association_to_string(operand_association));
   debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode));
   debug("effectuate_tool_main: memory_stride=%d\n", memory_stride);
   debug("effectuate_tool_main: storage_orientation='%s'\n", orientation_to_string(storage_orientation));

From b78c5307e89ee2139c397f604a6160ed0793f8cd Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 11:14:00 -0700
Subject: [PATCH 49/57] + Changed partition name tube to fiber (for
 correctness)

---
 src/operation_n_mode_product.cc | 67 +++++----------------------------
 src/thread.cc                   |  4 +-
 src/thread.h                    |  2 +-
 src/tool.h                      |  2 +-
 4 files changed, 13 insertions(+), 62 deletions(-)

diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index 3d259e3..a35b4f8 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -12,7 +12,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-extern association::type_t       operand_association;
 extern cache_t			 *cache;
 extern uint			 memory_stride;
 extern uint			 thread_count;
@@ -30,18 +29,6 @@ extern thread::partition::type_t thread_partition;
       end for
     end for
   end for
-  
-  Computing ($Tp$):
-  Let $\T \in R^{n\times n\times n}$ be a tensor.
-  Let $\M \in R^{n\times n}$ be a matrix.
-  Let $p \in R^{n}$ be a vector.
-  for i = 1 to l do
-    for j = 1 to m do 
-      for k = 1 to m do
-        M[i][j] += T[j][i][k] * p[k]
-      end for
-    end for
-  end for
 */
 
 typedef struct {
@@ -52,7 +39,7 @@ typedef struct {
 } product_thread_data_t;
 
 int
-tube_next(product_thread_data_t *data)
+fiber_next(product_thread_data_t *data)
 {
   volatile uint k;
   
@@ -63,26 +50,12 @@ tube_next(product_thread_data_t *data)
 }
 
 void
-tube_product_pT(product_thread_data_t *data, uint n, double **M, double *P, double *T)
+fiber_product_tube(product_thread_data_t *data, uint n, double **M, double *P, double *T)
 {
   int  t;
   uint i, j, offset;
   
-  while (-1 != (t = tube_next(data))) {
-    offset  = t*n;
-    i       = t/n;
-    j       = t%n;
-    M[i][j] = array_inner_product(n, P, 1, T+offset, 1);
-  }
-}
-
-void
-tube_product_Tp(product_thread_data_t *data, uint n, double **M, double *P, double *T)
-{
-  int  t;
-  uint i, j, offset;
-  
-  while (-1 != (t = tube_next(data))) {
+  while (-1 != (t = fiber_next(data))) {
     offset  = t*n;
     i       = t/n;
     j       = t%n;
@@ -91,17 +64,13 @@ tube_product_Tp(product_thread_data_t *data, uint n, double **M, double *P, doub
 }
 
 thread_address_t
-tube_product(thread_argument_t *argument)
+fiber_product(thread_argument_t *argument)
 {
   product_thread_data_t *data;
   
   data = (product_thread_data_t*) thread_data(argument);
   
-  if (association::left == operand_association) {
-    tube_product_pT(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
-  } else {
-    tube_product_Tp(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
-  }
+  fiber_product_tube(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
   
   return NULL;
 }
@@ -118,7 +87,7 @@ slice_next(product_thread_data_t *data)
 }
 
 void
-slice_product_pT(product_thread_data_t *data, uint n, double **M, double *P, double *T)
+slice_product_horizontal(product_thread_data_t *data, uint n, double **M, double *P, double *T)
 {
   int  i;
   uint j, ioffset, joffset;
@@ -132,20 +101,6 @@ slice_product_pT(product_thread_data_t *data, uint n, double **M, double *P, dou
   }
 }
 
-void
-slice_product_Tp(product_thread_data_t *data, uint n, double **M, double *P, double *T)
-{
-  int  i;
-  uint j, ioffset, joffset;
-  
-  while (-1 != (i = slice_next(data))) {
-    ioffset = i*n*n;
-    for (j = 0; j < n; ++j) {
-      joffset = ioffset+j*n;
-      M[i][j] = array_inner_product(n, P, 1, T+joffset, 1);
-    }
-  }
-}
 
 thread_address_t
 slice_product(thread_argument_t *argument)
@@ -154,11 +109,7 @@ slice_product(thread_argument_t *argument)
   
   data = (product_thread_data_t*) thread_data(argument);
   
-  if (association::left == operand_association) {
-    slice_product_pT(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
-  } else {
-    slice_product_Tp(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
-  }
+  slice_product_horizontal(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
   
   return NULL;
 }
@@ -188,8 +139,8 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   thread_function_t function;
   
   switch (thread_partition) {
-  case thread::partition::tube:
-    function = (thread_function_t) &tube_product;
+  case thread::partition::fiber:
+    function = (thread_function_t) &fiber_product;
     break;
   case thread::partition::slice:
     function = (thread_function_t) &slice_product;
diff --git a/src/thread.cc b/src/thread.cc
index 186e844..442db57 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -12,13 +12,13 @@
 
 static char const *map_thread_partition_to_string[] = { 
   "unknown",
-  "tube",
+  "fiber",
   "slice"
 };
 
 static char const *map_thread_partition_to_description[] = { 
   "unknown",
-  "tube per thread",
+  "fiber per thread",
   "slice per thread"
 };
 
diff --git a/src/thread.h b/src/thread.h
index ace57b2..3db2bb1 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -20,7 +20,7 @@ namespace thread {
   namespace partition {
     typedef enum {
       unknown,
-      tube,
+      fiber,
       slice
     } type_t;
   }
diff --git a/src/tool.h b/src/tool.h
index d913336..46291ed 100644
--- a/src/tool.h
+++ b/src/tool.h
@@ -29,7 +29,7 @@ namespace tool {
 #define DEFAULT_STRATEGY              strategy::array
 #define DEFAULT_TRACING               false
 #define DEFAULT_THREAD_COUNT          1
-#define DEFAULT_THREAD_PARTITION      thread::partition::tube
+#define DEFAULT_THREAD_PARTITION      thread::partition::fiber
 #define DEFAULT_VERBOSE               false
 #define DEFAULT_VERBOSITY             verbosity::low
 #define DEFAULT_WRITE_RESULTS         false

From 731271c6006883813a1ab52a8d1994e98525702c Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 18:14:47 -0700
Subject: [PATCH 50/57] + Added a UI to handle data partitioning

---
 src/Makefile                    | 14 ++++----
 src/data.cc                     | 57 +++++++++++++++++++++++++++++++++
 src/data.h                      | 21 ++++++++++++
 src/main.cc                     | 39 +++++++++++-----------
 src/operation.cc                |  2 +-
 src/operation_n_mode_product.cc | 37 ++++++++++++---------
 src/thread.cc                   | 52 ------------------------------
 src/thread.h                    | 24 --------------
 src/tool.h                      |  4 +--
 src/tool_effectuate.cc          | 55 +++++++++++++++----------------
 10 files changed, 157 insertions(+), 148 deletions(-)
 create mode 100644 src/data.cc
 create mode 100644 src/data.h

diff --git a/src/Makefile b/src/Makefile
index d47ab7f..50680aa 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -26,9 +26,9 @@ ifeq "$(OS)" "Darwin"
 endif
 
 HEADERS_CACHE=address.h cache.h hash.h
-HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h	\
-	algebra.h memory.h operation.h random.h thread.h strings.h	\
-	timer.h tool.h utility.h compatible.h
+HEADERS_GENERAL=arithmetic.h data.h error.h file.h information.h	\
+	latex.h algebra.h memory.h operation.h queue.h random.h		\
+	thread.h strings.h timer.h tool.h utility.h compatible.h
 HEADERS_GENERATE=generate.h
 HEADERS_MATRIX=matrix.h mmio.h
 HEADERS_TENSOR=tensor.h
@@ -37,12 +37,12 @@ HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE)	\
 	$(HEADERS_MATRIX) $(HEADERS_TENSOR) $(HEADERS_VECTOR)
 
 SOURCES_CACHE=address.cc cache.cc hash.cc
-SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc		\
+SOURCES_GENERAL=arithmetic.cc data.cc compatible.cc error.cc file.cc	\
 	information.cc latex.cc algebra.cc memory.cc mmio.cc		\
 	operation.cc operation_n_mode_product.cc operation_utility.cc	\
-	random.cc strings.cc thread.cc timer.cc tool_effectuate.cc	\
-	tool_generate.cc tool_timing.cc tool_utility.cc types.cc	\
-	utility.cc
+	queue.cc random.cc strings.cc thread.cc timer.cc		\
+	tool_effectuate.cc tool_generate.cc tool_timing.cc		\
+	tool_utility.cc types.cc utility.cc
 SOURCES_GENERATE=generate_tensor_from_matrix.cc
 SOURCES_MATRIX=matrix_arithmetic.cc matrix_clear.cc			\
 	matrix_compatible.cc matrix_copy.cc matrix_free.cc		\
diff --git a/src/data.cc b/src/data.cc
new file mode 100644
index 0000000..86086d3
--- /dev/null
+++ b/src/data.cc
@@ -0,0 +1,57 @@
+
+#include "data.h"
+#include "error.h"
+#include "utility.h"
+#include <string.h>
+
+static char const *map_data_partition_to_string[] = { 
+  "unknown",
+  "fiber",
+  "slice"
+};
+
+static char const *map_data_partition_to_description[] = { 
+  "unknown",
+  "fiber per data",
+  "slice per data"
+};
+
+char const*
+data_partition_to_string(data::partition::type_t partition)
+{
+  return map_data_partition_to_string[partition];
+}
+
+data::partition::type_t
+string_to_data_partition(char const *name)
+{
+  uint i;
+  
+  for (i = 0; i < COUNT_OF(map_data_partition_to_string); ++i) {
+    if (0 == strcmp(name, map_data_partition_to_string[i])) {
+      return (data::partition::type_t) i;
+    }
+  }
+  
+  return data::partition::unknown;
+}
+
+void
+print_data_partitions(char const *format)
+{
+  uint i;
+  
+  for (i = 1; i < COUNT_OF(map_data_partition_to_string); ++i) {
+    message(format, map_data_partition_to_string[i]);
+  }
+}
+
+void
+print_data_partitions_with_descriptions(char const *format)
+{
+  uint i;
+  
+  for (i = 1; i < COUNT_OF(map_data_partition_to_string); ++i) {
+    message(format, map_data_partition_to_string[i], map_data_partition_to_description[i]);
+  }
+}
diff --git a/src/data.h b/src/data.h
new file mode 100644
index 0000000..f8ab685
--- /dev/null
+++ b/src/data.h
@@ -0,0 +1,21 @@
+
+#ifndef _DATA_H_
+#define _DATA_H_
+
+namespace data {
+  namespace partition {
+    typedef enum {
+      unknown,
+      fiber,
+      slice
+    } type_t;
+  }
+}
+
+char const* data_partition_to_string(data::partition::type_t partition);
+data::partition::type_t string_to_data_partition(char const *name);
+void print_data_partitions(char const *format);
+void print_data_partitions_with_descriptions(char const *format);
+
+#endif /* _DATA_H_ */
+
diff --git a/src/main.cc b/src/main.cc
index ee257e2..88bc1e7 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -1,6 +1,7 @@
 
 #include "cache.h"
 #include "compatible.h"
+#include "data.h"
 #include "error.h"
 #include "file.h"
 #include "matrix.h"
@@ -18,25 +19,25 @@
 #include <ctype.h>
 #include <unistd.h>
 
-cache_t                   *cache;
-uint                      cache_size;
-uint                      cache_line_size;
-uint                      iterations;
-uint                      memory_stride;
-thread::partition::type_t thread_partition;
-uint                      seed;
-orientation::type_t       storage_orientation;
-strategy::type_t          storage_strategy;
-uint                      thread_count;
-char                      *tool_name;
-tool::type_t              tool_type;
-bool                      tracing;
-bool                      simulate;
-bool                      human_readable;
-bool                      verbose;
-verbosity::type_t         noisiness;
-bool                      write_results;
-bool                      emit_latex;
+cache_t                 *cache;
+uint                    cache_size;
+uint                    cache_line_size;
+uint                    iterations;
+uint                    memory_stride;
+data::partition::type_t data_partition;
+uint                    seed;
+orientation::type_t     storage_orientation;
+strategy::type_t        storage_strategy;
+uint                    thread_count;
+char                    *tool_name;
+tool::type_t            tool_type;
+bool                    tracing;
+bool                    simulate;
+bool                    human_readable;
+bool                    verbose;
+verbosity::type_t       noisiness;
+bool                    write_results;
+bool                    emit_latex;
 
 void
 usage()
diff --git a/src/operation.cc b/src/operation.cc
index c6a8fd5..2cd16d7 100644
--- a/src/operation.cc
+++ b/src/operation.cc
@@ -25,7 +25,7 @@ threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const
     threaded_n_mode_product_array(matrix, vector, tensor);
     break;
   default:
-    die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n",
+    die("threaded_n_mode_product: tensor product for '%s' strategy (using threads) is not currently supported.\n",
 	strategy_to_string(tensor->strategy));
     break;
   }
diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index a35b4f8..9d83fc4 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -1,5 +1,6 @@
 
 #include "algebra.h"
+#include "data.h"
 #include "cache.h"
 #include "compatible.h"
 #include "error.h"
@@ -12,10 +13,12 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-extern cache_t			 *cache;
-extern uint			 memory_stride;
-extern uint			 thread_count;
-extern thread::partition::type_t thread_partition;
+extern cache_t		       *cache;
+extern uint		       memory_stride;
+extern orientation::type_t     storage_orientation;
+extern strategy::type_t        storage_strategy;
+extern uint		       thread_count;
+extern data::partition::type_t data_partition;
 
 /*
   Computing ($pT$):
@@ -38,6 +41,8 @@ typedef struct {
   tensor_t const *tensor;
 } product_thread_data_t;
 
+typedef void (*n_mode_product_t)(product_thread_data_t *data, uint n, double **M, double *P, double *T);
+
 int
 fiber_next(product_thread_data_t *data)
 {
@@ -50,7 +55,7 @@ fiber_next(product_thread_data_t *data)
 }
 
 void
-fiber_product_tube(product_thread_data_t *data, uint n, double **M, double *P, double *T)
+fiber_product_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T)
 {
   int  t;
   uint i, j, offset;
@@ -70,7 +75,7 @@ fiber_product(thread_argument_t *argument)
   
   data = (product_thread_data_t*) thread_data(argument);
   
-  fiber_product_tube(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
+  fiber_product_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
   
   return NULL;
 }
@@ -87,21 +92,21 @@ slice_next(product_thread_data_t *data)
 }
 
 void
-slice_product_horizontal(product_thread_data_t *data, uint n, double **M, double *P, double *T)
+slice_product_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T)
 {
   int  i;
   uint j, ioffset, joffset;
   
   while (-1 != (i = slice_next(data))) {
     ioffset = i*n*n;
+    joffset = ioffset;
     for (j = 0; j < n; ++j) {
-      joffset = ioffset+j*n;
-      M[i][j] = array_inner_product(n, P, 1, T+joffset, 1);
+      M[i][j]  = array_inner_product(n, P, 1, T+joffset, 1);
+      joffset += n;
     }
   }
 }
 
-
 thread_address_t
 slice_product(thread_argument_t *argument)
 {
@@ -109,7 +114,7 @@ slice_product(thread_argument_t *argument)
   
   data = (product_thread_data_t*) thread_data(argument);
   
-  slice_product_horizontal(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
+  slice_product_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
   
   return NULL;
 }
@@ -138,16 +143,16 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
 {
   thread_function_t function;
   
-  switch (thread_partition) {
-  case thread::partition::fiber:
+  switch (data_partition) {
+  case data::partition::fiber:
     function = (thread_function_t) &fiber_product;
     break;
-  case thread::partition::slice:
+  case data::partition::slice:
     function = (thread_function_t) &slice_product;
     break;
   default:
-    die("serial_n_mode_product: tensor product for '%s' strategy is not currently supported.\n",
-	strategy_to_string(tensor->strategy));
+    die("threaded_n_mode_product_array: tensor product for '%s' partition is not currently supported.\n",
+	data_partition_to_string(data_partition));
     break;
   }
   
diff --git a/src/thread.cc b/src/thread.cc
index 442db57..d3cf79f 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -10,58 +10,6 @@
 #include <stdlib.h>
 #include <errno.h>	/* for EBUSY */
 
-static char const *map_thread_partition_to_string[] = { 
-  "unknown",
-  "fiber",
-  "slice"
-};
-
-static char const *map_thread_partition_to_description[] = { 
-  "unknown",
-  "fiber per thread",
-  "slice per thread"
-};
-
-char const*
-thread_partition_to_string(thread::partition::type_t partition)
-{
-  return map_thread_partition_to_string[partition];
-}
-
-thread::partition::type_t
-string_to_thread_partition(char const *name)
-{
-  uint i;
-  
-  for (i = 0; i < COUNT_OF(map_thread_partition_to_string); ++i) {
-    if (0 == strcmp(name, map_thread_partition_to_string[i])) {
-      return (thread::partition::type_t) i;
-    }
-  }
-  
-  return thread::partition::unknown;
-}
-
-void
-print_thread_partitions(char const *format)
-{
-  uint i;
-  
-  for (i = 1; i < COUNT_OF(map_thread_partition_to_string); ++i) {
-    message(format, map_thread_partition_to_string[i]);
-  }
-}
-
-void
-print_thread_partitions_with_descriptions(char const *format)
-{
-  uint i;
-  
-  for (i = 1; i < COUNT_OF(map_thread_partition_to_string); ++i) {
-    message(format, map_thread_partition_to_string[i], map_thread_partition_to_description[i]);
-  }
-}
-
 /*************************************************
  * attempt to lock a mutex
  */
diff --git a/src/thread.h b/src/thread.h
index 3db2bb1..e565ccb 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -8,30 +8,6 @@
 #ifndef _THREAD_H_
 #define _THREAD_H_
 
-namespace thread {
-  
-  namespace model {
-    typedef enum {
-      unknown,
-      traditional
-    } type_t;
-  }
-  
-  namespace partition {
-    typedef enum {
-      unknown,
-      fiber,
-      slice
-    } type_t;
-  }
-  
-}
-
-char const* thread_partition_to_string(thread::partition::type_t partition);
-thread::partition::type_t string_to_thread_partition(char const *name);
-void print_thread_partitions(char const *format);
-void print_thread_partitions_with_descriptions(char const *format);
-
 /* Linux defs:
  *   _REENTRANT to get thread-safe libs
  *   _POSIX_SOURCE to get POSIX semantics
diff --git a/src/tool.h b/src/tool.h
index 46291ed..b04db18 100644
--- a/src/tool.h
+++ b/src/tool.h
@@ -24,12 +24,12 @@ namespace tool {
 #define DEFAULT_ITERATIONS            1
 #define DEFAULT_MEMORY_STRIDE         32
 #define DEFAULT_OPERATION             operation::n_mode_product
-#define DEFAULT_ORIENTATION           orientation::row
+#define DEFAULT_ORIENTATION           orientation::tube
 #define DEFAULT_SIMULATE              false
 #define DEFAULT_STRATEGY              strategy::array
 #define DEFAULT_TRACING               false
 #define DEFAULT_THREAD_COUNT          1
-#define DEFAULT_THREAD_PARTITION      thread::partition::fiber
+#define DEFAULT_THREAD_PARTITION      data::partition::fiber
 #define DEFAULT_VERBOSE               false
 #define DEFAULT_VERBOSITY             verbosity::low
 #define DEFAULT_WRITE_RESULTS         false
diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc
index 92f788d..fdb2b1b 100644
--- a/src/tool_effectuate.cc
+++ b/src/tool_effectuate.cc
@@ -1,6 +1,7 @@
 
 #include "cache.h"
 #include "compatible.h"
+#include "data.h"
 #include "error.h"
 #include "file.h"
 #include "matrix.h"
@@ -19,23 +20,23 @@
 #include <ctype.h>
 #include <unistd.h>
 
-extern cache_t			 *cache;
-extern uint			 cache_size;
-extern uint			 cache_line_size;
-extern bool			 human_readable;
-extern uint			 iterations;
-extern uint			 memory_stride;
-extern orientation::type_t       storage_orientation;
-extern strategy::type_t          storage_strategy;
-extern uint			 thread_count;
-extern thread::partition::type_t thread_partition;
-extern char			 *tool_name;
-extern tool::type_t		 tool_type;
-extern bool			 simulate;
-extern bool			 tracing;
-extern bool			 verbose;
-extern verbosity::type_t	 noisiness;
-extern bool			 write_results;
+extern cache_t		       *cache;
+extern uint		       cache_size;
+extern uint		       cache_line_size;
+extern bool		       human_readable;
+extern uint		       iterations;
+extern uint		       memory_stride;
+extern orientation::type_t     storage_orientation;
+extern strategy::type_t	       storage_strategy;
+extern uint		       thread_count;
+extern data::partition::type_t data_partition;
+extern char		       *tool_name;
+extern tool::type_t	       tool_type;
+extern bool		       simulate;
+extern bool		       tracing;
+extern bool		       verbose;
+extern verbosity::type_t       noisiness;
+extern bool		       write_results;
 
 static operation::type_t optcode;
 
@@ -54,25 +55,25 @@ effectuate_tool_usage()
   message("\t-n\tnumber of times to apply operation (default: %d)\n", DEFAULT_ITERATIONS);
   message("\t-o\toperation (default: %s)\n", operation_to_string(DEFAULT_OPERATION));
   print_operations_with_descriptions("\t\t- %s : %s\n");
-  message("\t-O\torientation (default: %s)\n", orientation_to_string(DEFAULT_ORIENTATION));
+  message("\t-O\tin memory storage orientation (default: %s)\n", orientation_to_string(DEFAULT_ORIENTATION));
   print_orientations("\t\t- %s\n");
 #if !defined (NOSIMULATE)
   message("\t-s\tsimulate cache (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_SIMULATE));
 #endif
-  message("\t-S\tstrategy (default: %s)\n", strategy_to_string(DEFAULT_STRATEGY));
+  message("\t-S\tin memory storage strategy (default: %s)\n", strategy_to_string(DEFAULT_STRATEGY));
   print_strategies("\t\t- %s\n");
-  message("\t-p\tpartition scheme for work (default: %s)\n", thread_partition_to_string(DEFAULT_THREAD_PARTITION));
-  print_thread_partitions_with_descriptions("\t\t- %s : %s\n");
+  message("\t-p\tpartition scheme for work (default: %s)\n", data_partition_to_string(DEFAULT_THREAD_PARTITION));
+  print_data_partitions_with_descriptions("\t\t- %s : %s\n");
   message("\t-t\tnumber of threads to use for operation (default: %d)\n", DEFAULT_THREAD_COUNT);
   message("\t-T\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING));
   message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE));
   message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max);
   message("\t-w\twrite results (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_WRITE_RESULTS));
   message("\nExample:\n\n");
-  message("\t$ ./tensor %s -o n-mode vector100.in dense100.in\n", tool_name);
+  message("\t$ ./tensor %s -o n-mode vector.in tensor.in\n", tool_name);
   message("\tReading vector.in ... done [0.000305]\n");
   message("\tReading tensor.in ... done [0.000235]\n");
-  message("\tPerforming operation 'dense tensor \times vector product' ... done [3.736000]");
+  message("\tPerforming operation 'dense tensor \\times vector product' ... done [3.736000]\n");
   exit(1);
 }
 
@@ -194,7 +195,7 @@ effectuate_tool_main(int argc, char *argv[])
   storage_orientation = DEFAULT_ORIENTATION;
   storage_strategy    = DEFAULT_STRATEGY;
   thread_count        = DEFAULT_THREAD_COUNT;
-  thread_partition    = DEFAULT_THREAD_PARTITION;
+  data_partition    = DEFAULT_THREAD_PARTITION;
   
   /* we will privide our own error messages */
   opterr = 0;
@@ -239,9 +240,9 @@ effectuate_tool_main(int argc, char *argv[])
       break;
     case 'p':
       if (isdigit(optarg[0])) {
-	thread_partition = (thread::partition::type_t) atoi(optarg);
+	data_partition = (data::partition::type_t) atoi(optarg);
       } else {
-	thread_partition = string_to_thread_partition(optarg);
+	data_partition = string_to_data_partition(optarg);
       }
       break;
     case 'r':
@@ -312,7 +313,7 @@ effectuate_tool_main(int argc, char *argv[])
   debug("effectuate_tool_main: storage_orientation='%s'\n", orientation_to_string(storage_orientation));
   debug("effectuate_tool_main: storage_strategy='%s'\n", strategy_to_string(storage_strategy));
   debug("effectuate_tool_main: thread_count=%d\n", thread_count);
-  debug("effectuate_tool_main: thread_partition='%s'\n", thread_partition_to_string(thread_partition));
+  debug("effectuate_tool_main: data_partition='%s'\n", data_partition_to_string(data_partition));
   
   /* if we are just running a simulation, then we only do one
      iteration; otherwise, it would be really slow */

From fb5de7753d82bfc5017a16cddd7319d368a52550 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Mon, 14 Nov 2011 18:15:20 -0700
Subject: [PATCH 51/57] + Basic queue support

---
 src/queue.cc | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/queue.h  | 27 ++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 src/queue.cc
 create mode 100644 src/queue.h

diff --git a/src/queue.cc b/src/queue.cc
new file mode 100644
index 0000000..6bfcf28
--- /dev/null
+++ b/src/queue.cc
@@ -0,0 +1,63 @@
+
+#include "queue.h"
+#include "error.h"
+#include "memory.h"
+#include "utility.h"
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+queue_t*
+queue_malloc()
+{
+  queue_t *queue;
+  
+  superfluous("queue_malloc(max_size=%d)\n", max_size);
+  
+  queue        = MALLOC(queue_t);
+  queue->first = 0;
+  queue->last  = MAX_QUEUE_SIZE-1;
+  queuecount   = 0;
+  
+  thread_mutex_init(&queue->lock);
+  
+  return queue;
+}
+
+void
+queue_free(queue_t *queue)
+{
+  superfluous("queue_free(queue=0x%x)\n", queue);
+  
+  thread_mutex_destroy(&queue->lock);
+  safe_free(queue);
+}
+void
+queue_push(queue_t *queue, queue_node_t *node, uint x)
+{
+  debug("queue_update(queue=0x%x, node=0x%x, data=0x%x)\n", queue, node, data);
+  
+  thread_mutex_lock(&queue->lock);
+  
+  queue->last              = (queue->last+1) % MAX_QUEUE_SIZE; 
+  queue->data[queue->last] = x;
+  queue->count++;
+  
+  thread_mutex_unlock(&queue->lock);
+}
+
+void
+queue_pop(queue_t *queue)
+{
+  int current, x;
+
+  thread_mutex_lock(&queue->lock);
+  
+  x            = queue->data[queue->first];
+  queue->first = (queue->first+1) % QUEUE_SIZE;
+  queue->count--;
+  
+  return x;
+}
+
+
diff --git a/src/queue.h b/src/queue.h
new file mode 100644
index 0000000..0838754
--- /dev/null
+++ b/src/queue.h
@@ -0,0 +1,27 @@
+
+#ifndef _QUEUE_H_
+#define _QUEUE_H_
+
+#include "thread.h"
+#include "types.h"
+
+#define MAX_QUEUE_SIZE 100
+
+typedef struct queue_tag {
+  uint            data[MAX_QUEUE_SIZE];
+  uint            first, last;
+  pthread_mutex_t lock;
+} queue_t;
+
+queue_t* queue_malloc();
+void queue_free(queue_t *queue);
+void queue_push(queue_t *queue, uint x);
+int queue_pop(queue_t *queue);
+
+#endif /* _HASH_H_ */
+
+/*
+  Local Variables:
+  mode: C++
+  End:
+*/

From 02d6d1b4d753d326e065d11cab56642fe2f15c73 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Tue, 15 Nov 2011 09:34:36 -0700
Subject: [PATCH 52/57] + Added a lock-free queue

---
 src/queue.cc | 98 +++++++++++++++++++++++++++++++++-------------------
 src/queue.h  | 16 ++++-----
 2 files changed, 71 insertions(+), 43 deletions(-)

diff --git a/src/queue.cc b/src/queue.cc
index 6bfcf28..71eb443 100644
--- a/src/queue.cc
+++ b/src/queue.cc
@@ -8,56 +8,84 @@
 #include <string.h>
 
 queue_t*
-queue_malloc()
+queue_malloc(void)
 {
-  queue_t *queue;
+  queue_t *q;
   
-  superfluous("queue_malloc(max_size=%d)\n", max_size);
+  q                 = MALLOC(queue_t);
+  q->head = q->tail = MALLOC(node_t);
   
-  queue        = MALLOC(queue_t);
-  queue->first = 0;
-  queue->last  = MAX_QUEUE_SIZE-1;
-  queuecount   = 0;
-  
-  thread_mutex_init(&queue->lock);
-  
-  return queue;
+  return q;
 }
 
 void
-queue_free(queue_t *queue)
+queue_push(queue_t *q, void *data)
 {
-  superfluous("queue_free(queue=0x%x)\n", queue);
+  node_t *node, *tail, *next;
   
-  thread_mutex_destroy(&queue->lock);
-  safe_free(queue);
-}
-void
-queue_push(queue_t *queue, queue_node_t *node, uint x)
-{
-  debug("queue_update(queue=0x%x, node=0x%x, data=0x%x)\n", queue, node, data);
+  node       = MALLOC(node_t);
+  node->data = data;
+  node->next = NULL;
   
-  thread_mutex_lock(&queue->lock);
+  while (true) {
+    
+    tail = q->tail;
+    next = tail->next;
+    
+    if (tail != q->tail) {
+      continue;
+    }
+    
+    if (NULL != next) {
+      __sync_bool_compare_and_swap(&q->tail, tail, next);
+      continue;
+    }
+    
+    if (__sync_bool_compare_and_swap(&tail->next, NULL, node)) {
+      break;
+    }
+    
+  }
   
-  queue->last              = (queue->last+1) % MAX_QUEUE_SIZE; 
-  queue->data[queue->last] = x;
-  queue->count++;
-  
-  thread_mutex_unlock(&queue->lock);
+  __sync_bool_compare_and_swap(&q->tail, tail, node);
 }
 
-void
-queue_pop(queue_t *queue)
+void*
+queue_pop(queue_t *q)
 {
-  int current, x;
-
-  thread_mutex_lock(&queue->lock);
+  void   *data;
+  node_t *head, *tail, *next;
+  
+  while (true) {
+    
+    head = q->head;
+    tail = q->tail;
+    next = head->next;
+    
+    if (head != q->head) {
+      continue;
+    }
+    
+    if (NULL == next) {
+      return NULL; // Empty
+    }
+    
+    if (head == tail) {
+      __sync_bool_compare_and_swap(&q->tail, tail, next);
+      continue;
+    }
+    
+    data = next->data;
+    
+    if (__sync_bool_compare_and_swap(&q->head, head, next)) {
+      break;
+    }
+    
+  }
   
-  x            = queue->data[queue->first];
-  queue->first = (queue->first+1) % QUEUE_SIZE;
-  queue->count--;
+  safe_free(head);
   
-  return x;
+  return data;
 }
 
 
diff --git a/src/queue.h b/src/queue.h
index 0838754..46e28e0 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -2,21 +2,21 @@
 #ifndef _QUEUE_H_
 #define _QUEUE_H_
 
-#include "thread.h"
 #include "types.h"
 
-#define MAX_QUEUE_SIZE 100
+typedef struct _node_t {
+  void    *data;
+  _node_t *next;
+} node_t;
 
-typedef struct queue_tag {
-  uint            data[MAX_QUEUE_SIZE];
-  uint            first, last;
-  pthread_mutex_t lock;
+typedef struct _queue_t {
+  node_t *head, *tail;
 } queue_t;
 
 queue_t* queue_malloc();
 void queue_free(queue_t *queue);
-void queue_push(queue_t *queue, uint x);
-int queue_pop(queue_t *queue);
+void queue_push(queue_t *queue, void *data);
+void* queue_pop(queue_t *queue);
 
 #endif /* _HASH_H_ */
 

From 98f2317fce07655fc964c0cb53c7d4369c51f15a Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Tue, 15 Nov 2011 09:37:50 -0700
Subject: [PATCH 53/57] + Added Darwin and Linux code to detect the number of
 CPUs/cores

---
 src/thread.cc | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/src/thread.cc b/src/thread.cc
index d3cf79f..4b8edd4 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -10,6 +10,40 @@
 #include <stdlib.h>
 #include <errno.h>	/* for EBUSY */
 
+#ifdef __APPLE__
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#endif
+
+#ifdef __linux__
+#include <sys/sysinfo.h>
+#endif
+
+/*************************************************
+ * get the number of CPUs on this machine (returns 1 if unknown)
+ */
+int
+thread_get_cpu_count()
+{
+#if defined(__APPLE__)
+  int    i;
+  size_t s;
+  
+  i = 0;
+  s = sizeof(i);
+  
+  if (sysctlbyname("hw.ncpu", &i, &s, NULL, 0)) {
+    return 1; /* query failed */
+  }
+  
+  return i;
+#elif defined(__linux__)
+  return get_nprocs();
+#else
+  return 1; /* unsupported platform: never fall off the end of a non-void function */
+#endif
+}
+
 /*************************************************
  * attempt to lock a mutex
  */

From a41bbec7932422b063a9eca0ae1b2499034c11a1 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Tue, 15 Nov 2011 09:38:39 -0700
Subject: [PATCH 54/57] + Added a new data partitioning scheme

---
 src/data.cc                     |  6 ++++--
 src/operation_n_mode_product.cc | 35 ++++++++++++++++++++++-----------
 2 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/src/data.cc b/src/data.cc
index 86086d3..8ce13cc 100644
--- a/src/data.cc
+++ b/src/data.cc
@@ -7,13 +7,15 @@
 static char const *map_data_partition_to_string[] = { 
   "unknown",
   "fiber",
-  "slice"
+  "slice",
+  "fiber-decomposition"
 };
 
 static char const *map_data_partition_to_description[] = { 
   "unknown",
   "fiber per data",
-  "slice per data"
+  "slice per data",
+  "fibers decomposed by binary splitting"
 };
 
 char const*
diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index 9d83fc4..efbfb6f 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -6,6 +6,7 @@
 #include "error.h"
 #include "matrix.h"
 #include "operation.h"
+#include "queue.h"
 #include "thread.h"
 #include "tensor.h"
 #include "utility.h"
@@ -55,7 +56,7 @@ fiber_next(product_thread_data_t *data)
 }
 
 void
-fiber_product_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T)
+fiber_consumer_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T)
 {
   int  t;
   uint i, j, offset;
@@ -69,13 +70,13 @@ fiber_product_implementation(product_thread_data_t *data, uint n, double **M, do
 }
 
 thread_address_t
-fiber_product(thread_argument_t *argument)
+fiber_consumer(thread_argument_t *argument)
 {
   product_thread_data_t *data;
   
   data = (product_thread_data_t*) thread_data(argument);
   
-  fiber_product_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
+  fiber_consumer_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
   
   return NULL;
 }
@@ -92,7 +93,7 @@ slice_next(product_thread_data_t *data)
 }
 
 void
-slice_product_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T)
+slice_consumer_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T)
 {
   int  i;
   uint j, ioffset, joffset;
@@ -108,19 +109,19 @@ slice_product_implementation(product_thread_data_t *data, uint n, double **M, do
 }
 
 thread_address_t
-slice_product(thread_argument_t *argument)
+slice_consumer(thread_argument_t *argument)
 {
   product_thread_data_t *data;
   
   data = (product_thread_data_t*) thread_data(argument);
   
-  slice_product_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
+  slice_consumer_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values);
   
   return NULL;
 }
 
 void
-threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor, thread_function_t function)
+threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor, thread_function_t producer, thread_function_t consumer)
 {
   product_thread_data_t data;
   
@@ -135,28 +136,38 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   data.vector = vector;
   data.tensor = tensor;
   
-  thread_afork(thread_count, function, &data, NULL);
+  if (NULL != producer) {
+    thread_create_detached(producer, &data);
+  }
+  
+  thread_afork(thread_count, consumer, &data, NULL);
 }
 
 void
 threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor)
 {
-  thread_function_t function;
+  thread_function_t consumer, producer;
+  
+  producer = NULL;
+  consumer = NULL;
   
   switch (data_partition) {
   case data::partition::fiber:
-    function = (thread_function_t) &fiber_product;
+    consumer = (thread_function_t) &fiber_consumer;
     break;
   case data::partition::slice:
-    function = (thread_function_t) &slice_product;
+    consumer = (thread_function_t) &slice_consumer;
     break;
+  case data::partition::fiber_decomposition:
+    consumer = (thread_function_t) &subfiber_consumer;
+    producer = (thread_function_t) &subfiber_producer;
   default:
     die("threaded_n_mode_product_array: tensor product for '%s' partition is not currently supported.\n",
 	data_partition_to_string(data_partition));
     break;
   }
   
-  threaded_n_mode_product_array(matrix, vector, tensor, function);
+  threaded_n_mode_product_array(matrix, vector, tensor, producer, consumer);
 }
  
 void

From 96a2f9cfdfab7c9a359391ab1f3d92fbdcfb64df Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Tue, 15 Nov 2011 11:32:08 -0700
Subject: [PATCH 55/57] + Added a 'semi-block recursive' data partitioning
 scheme

---
 src/compatible.cc               |  2 +-
 src/data.cc                     |  8 +++---
 src/data.h                      |  3 ++-
 src/operation_n_mode_product.cc | 45 ++++++++++++++++++++++++++-------
 4 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/src/compatible.cc b/src/compatible.cc
index a46c7b5..773acb1 100644
--- a/src/compatible.cc
+++ b/src/compatible.cc
@@ -28,7 +28,7 @@ compatible(vector_t const *lhs, tensor_t const *rhs)
 	strategy_to_string(rhs->strategy));
   }
   
-  compatible = (lhs->n == rhs->l);
+  compatible = (lhs->n == rhs->n);
   
   if (!compatible) {
     print_information(lhs);
diff --git a/src/data.cc b/src/data.cc
index 8ce13cc..492fa26 100644
--- a/src/data.cc
+++ b/src/data.cc
@@ -8,14 +8,14 @@ static char const *map_data_partition_to_string[] = {
   "unknown",
   "fiber",
   "slice",
-  "fiber-decomposition"
+  "block"
 };
 
 static char const *map_data_partition_to_description[] = { 
   "unknown",
-  "fiber per data",
-  "slice per data",
-  "fibers decomposed by binary splitting"
+  "fiber per thread",
+  "slice per thread",
+  "block per thread"
 };
 
 char const*
diff --git a/src/data.h b/src/data.h
index f8ab685..db7d1a5 100644
--- a/src/data.h
+++ b/src/data.h
@@ -7,7 +7,8 @@ namespace data {
     typedef enum {
       unknown,
       fiber,
-      slice
+      slice,
+      block
     } type_t;
   }
 }
diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index efbfb6f..b3b5693 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -81,6 +81,39 @@ fiber_consumer(thread_argument_t *argument)
   return NULL;
 }
 
+void
+block_consumer_implementation(product_thread_data_t *data, uint n, uint start, uint end, double **M, double *P, double *T)
+{
+  uint i, j, t, offset;
+  
+  for (t = start; t < end; ++t) {
+    offset  = t*n;
+    i       = t/n;
+    j       = t%n;
+    M[i][j] = array_inner_product(n, P, 1, T+offset, 1);
+  }
+}
+
+thread_address_t
+block_consumer(thread_argument_t *argument)
+{
+  int                   id;
+  uint                  n, block_size;
+  uint                  start, end;
+  product_thread_data_t *data;
+  
+  data       = (product_thread_data_t*) thread_data(argument);
+  n          = data->tensor->n;
+  block_size = (n*n)/thread_count;
+  id         = thread_myid(argument);
+  start      = block_size*id;
+  end        = start+block_size;
+  
+  block_consumer_implementation(data, n, start, end, data->matrix->data, data->vector->data, data->tensor->values);
+  
+  return NULL;
+}
+
 int
 slice_next(product_thread_data_t *data)
 {
@@ -125,12 +158,6 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
 {
   product_thread_data_t data;
   
-  memory_stride = memory_stride > tensor->n ? tensor->n : memory_stride;
-  thread_count  = thread_count > tensor->n ? tensor->n : thread_count;
-  
-  debug("threaded_n_mode_product_array: memory_stride=%d\n", memory_stride);
-  debug("threaded_n_mode_product_array: thread_count=%d\n", thread_count);
-  
   data.done   = 0;
   data.matrix = matrix;
   data.vector = vector;
@@ -158,9 +185,9 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   case data::partition::slice:
     consumer = (thread_function_t) &slice_consumer;
     break;
-  case data::partition::fiber_decomposition:
-    consumer = (thread_function_t) &subfiber_consumer;
-    producer = (thread_function_t) &subfiber_producer;
+  case data::partition::block:
+    consumer = (thread_function_t) &block_consumer;
+    break;
   default:
     die("threaded_n_mode_product_array: tensor product for '%s' partition is not currently supported.\n",
 	data_partition_to_string(data_partition));

From 7cd401edc1b177508a3f8d2f6ea46ce0ef633d35 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Tue, 15 Nov 2011 12:05:47 -0700
Subject: [PATCH 56/57] + Added fiber and block data partitioning

---
 src/data.cc                     |  6 ++--
 src/data.h                      |  3 +-
 src/operation_n_mode_product.cc | 52 +++++++++++++++++++++++++++++----
 3 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/src/data.cc b/src/data.cc
index 492fa26..981fc56 100644
--- a/src/data.cc
+++ b/src/data.cc
@@ -7,15 +7,17 @@
 static char const *map_data_partition_to_string[] = { 
   "unknown",
   "fiber",
+  "fiber-block",
   "slice",
-  "block"
+  "slice-block"
 };
 
 static char const *map_data_partition_to_description[] = { 
   "unknown",
   "fiber per thread",
+  "block of fibers per thread", /* fiber_block */
   "slice per thread",
-  "block per thread"
+  "block of slices per thread"  /* slice_block */
 };
 
 char const*
diff --git a/src/data.h b/src/data.h
index db7d1a5..5c15991 100644
--- a/src/data.h
+++ b/src/data.h
@@ -7,8 +7,9 @@ namespace data {
     typedef enum {
       unknown,
       fiber,
+      fiber_block,
       slice,
-      block
+      slice_block
     } type_t;
   }
 }
diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc
index b3b5693..cd83589 100644
--- a/src/operation_n_mode_product.cc
+++ b/src/operation_n_mode_product.cc
@@ -82,7 +82,7 @@ fiber_consumer(thread_argument_t *argument)
 }
 
 void
-block_consumer_implementation(product_thread_data_t *data, uint n, uint start, uint end, double **M, double *P, double *T)
+fiber_block_consumer_implementation(product_thread_data_t *data, uint n, uint start, uint end, double **M, double *P, double *T)
 {
   uint i, j, t, offset;
   
@@ -95,7 +95,7 @@ block_consumer_implementation(product_thread_data_t *data, uint n, uint start, u
 }
 
 thread_address_t
-block_consumer(thread_argument_t *argument)
+fiber_block_consumer(thread_argument_t *argument)
 {
   int                   id;
   uint                  n, block_size;
@@ -104,12 +104,12 @@ block_consumer(thread_argument_t *argument)
   
   data       = (product_thread_data_t*) thread_data(argument);
   n          = data->tensor->n;
-  block_size = (n*n)/thread_count;
   id         = thread_myid(argument);
+  block_size = (n*n)/thread_count;
   start      = block_size*id;
   end        = start+block_size;
   
-  block_consumer_implementation(data, n, start, end, data->matrix->data, data->vector->data, data->tensor->values);
+  fiber_block_consumer_implementation(data, n, start, end, data->matrix->data, data->vector->data, data->tensor->values);
   
   return NULL;
 }
@@ -153,6 +153,43 @@ slice_consumer(thread_argument_t *argument)
   return NULL;
 }
 
+void
+slice_block_consumer_implementation(int id, product_thread_data_t *data, uint n, uint start, uint end, double **M, double *P, double *T)
+{
+  uint i, j, ioffset, joffset;
+  
+  for (i = start; i < end; ++i) {
+    ioffset = i*n*n;
+    joffset = ioffset;
+    for (j = 0; j < n; ++j) {
+      M[i][j]  = array_inner_product(n, P, 1, T+joffset, 1);
+      joffset += n;
+    }
+  }
+}
+
+thread_address_t
+slice_block_consumer(thread_argument_t *argument)
+{
+  int                   id;
+  uint                  n, block_size;
+  uint                  start, end;
+  product_thread_data_t *data;
+  
+  data       = (product_thread_data_t*) thread_data(argument);
+  n          = data->tensor->n;
+  id         = thread_myid(argument);
+  block_size = n/thread_count;
+  start      = block_size*id;
+  end        = start+block_size;
+  
+  DEBUG("thread:%d: block_size=%d/%d=%d, start=%d, end=%d\n", id, n, thread_count, block_size, start, end);
+  
+  slice_block_consumer_implementation(id, data, n, start, end, data->matrix->data, data->vector->data, data->tensor->values);
+  
+  return NULL;
+}
+
 void
 threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor, thread_function_t producer, thread_function_t consumer)
 {
@@ -182,11 +219,14 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t
   case data::partition::fiber:
     consumer = (thread_function_t) &fiber_consumer;
     break;
+  case data::partition::fiber_block:
+    consumer = (thread_function_t) &fiber_block_consumer;
+    break;
   case data::partition::slice:
     consumer = (thread_function_t) &slice_consumer;
     break;
-  case data::partition::block:
-    consumer = (thread_function_t) &block_consumer;
+  case data::partition::slice_block:
+    consumer = (thread_function_t) &slice_block_consumer;
     break;
   default:
     die("threaded_n_mode_product_array: tensor product for '%s' partition is not currently supported.\n",

From 3fd5b17aabbf0829f30508a43be5e94493a5d330 Mon Sep 17 00:00:00 2001
From: Ben Burnett <ben.burnett@gmail.com>
Date: Thu, 17 Nov 2011 14:45:53 -0700
Subject: [PATCH 57/57] + Fixed gcc warning about the use of an uninitialized
 variable

---
 src/matrix_write.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/matrix_write.cc b/src/matrix_write.cc
index f1b2fa1..4c8f77b 100644
--- a/src/matrix_write.cc
+++ b/src/matrix_write.cc
@@ -123,7 +123,7 @@ matrix_write(char const *filename, matrix_t const *matrix, format::type_t format
 {
   FILE *file;
   
-  debug("matrix_write(0x%x)\n", file);
+  debug("matrix_write('%s')\n", filename);
   
   file = fopen_or_die(filename, "w+");
   matrix_fwrite(file, matrix, format);