From 0644cb627cd41c58b37a634f0de9f262161aa8bc Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 14:05:45 -0700 Subject: [PATCH 01/57] + Fixed bug in which an unknown file type is never detected --- src/tensor_read.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tensor_read.cc b/src/tensor_read.cc index c7908bb..376b37c 100644 --- a/src/tensor_read.cc +++ b/src/tensor_read.cc @@ -339,6 +339,8 @@ detect_file_format(FILE *file) debug("detect_file_format(0x%x)\n", file); + format = file_format::unknown; + if (EOF != (c = peek(file))) { if ('%' == c) { format = file_format::mmio; From 868e7e90a717f67b5410d6ea7dda142944a3ed84 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 14:06:51 -0700 Subject: [PATCH 02/57] + Added global thread count variable --- src/main.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main.cc b/src/main.cc index e74133a..1f10169 100644 --- a/src/main.cc +++ b/src/main.cc @@ -22,6 +22,7 @@ uint cache_size; uint cache_line_size; uint iterations; uint seed; +uint threads; char *tool_name; tool::type_t tool_type; bool tracing; From 0a7e82a815107aa34bcc9f20422f13cc12919289 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 14:07:55 -0700 Subject: [PATCH 03/57] + Changed matrix to be stored as a contiguous vector --- src/matrix_free.cc | 6 +----- src/matrix_malloc.cc | 5 ++++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/matrix_free.cc b/src/matrix_free.cc index dff8863..f3ec950 100644 --- a/src/matrix_free.cc +++ b/src/matrix_free.cc @@ -11,8 +11,6 @@ void matrix_free(matrix_t *matrix) { - uint i; - superfluous("matrix_free(matrix=0x%x)\n", matrix); if (!matrix) { @@ -20,9 +18,7 @@ matrix_free(matrix_t *matrix) } if (ownership::creator == matrix->owner) { - for (i = 0; i < matrix->m; ++i) { - safe_free(matrix->data[i]); - } + safe_free(matrix->data[0]); safe_free(matrix->data); } diff --git a/src/matrix_malloc.cc b/src/matrix_malloc.cc index c541477..f4e45b6 100644 --- 
a/src/matrix_malloc.cc +++ b/src/matrix_malloc.cc @@ -44,6 +44,7 @@ matrix_malloc(uint m, uint n, ownership::type_t owner) { uint i; matrix_t *mr; + double *p; superfluous("matrix_malloc(m=%d, n=%d, owner='%s')\n", m, n, ownership_to_string(owner)); @@ -59,8 +60,10 @@ matrix_malloc(uint m, uint n, ownership::type_t owner) } mr->data = MALLOC_N(double*, m); + p = MALLOC_N(double, m*n); for (i = 0; i < m; ++i) { - mr->data[i] = MALLOC_N(double, n); + mr->data[i] = p; + p += n; } return mr; From e0c37d237dcad6a64fa03c919f17d5a4c76ab65a Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 14:08:42 -0700 Subject: [PATCH 04/57] + Added command line support for thread count to use during execution --- src/tool.h | 1 + src/tool_effectuate.cc | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/tool.h b/src/tool.h index 01a4fd4..851792d 100644 --- a/src/tool.h +++ b/src/tool.h @@ -30,6 +30,7 @@ namespace tool { #define DEFAULT_SIMULATE false #define DEFAULT_STRATEGY strategy::compressed #define DEFAULT_TRACING false +#define DEFAULT_THREAD_COUNT 1 #define DEFAULT_VERBOSE false #define DEFAULT_VERBOSITY verbosity::low #define DEFAULT_WRITE_RESULTS false diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index 68e0714..fc339f3 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -23,6 +23,7 @@ extern uint cache_size; extern uint cache_line_size; extern uint iterations; extern bool human_readable; +extern uint threads; extern char *tool_name; extern tool::type_t tool_type; extern bool simulate; @@ -51,7 +52,8 @@ effectuate_tool_usage() #if !defined (NOSIMULATE) message("\t-s\tsimulate cache (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_SIMULATE)); #endif - message("\t-t\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING)); + message("\t-t\tnumer of threads to use (default: %d)\n", DEFAULT_THREAD_COUNT); + message("\t-T\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING)); 
message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE)); message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max); message("\t-w\twrite results (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_WRITE_RESULTS)); @@ -183,7 +185,7 @@ effectuate_tool_main(int argc, char *argv[]) opterr = 0; /* extract any command-line options the user provided */ - while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:p:stuvV:w"))) { + while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:p:st:TuvV:w"))) { switch (c) { case 'h': effectuate_tool_usage(); @@ -217,6 +219,12 @@ effectuate_tool_main(int argc, char *argv[]) simulate = !simulate; break; case 't': + threads = atoi(optarg); + if (0 == threads) { + threads = DEFAULT_THREAD_COUNT; + } + break; + case 'T': tracing = !tracing; break; case 'u': From f03f7c3c6ebfc0248708e9faeb3c9926e23bd55f Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 14:20:44 -0700 Subject: [PATCH 05/57] + Removed code that drops zero entries --- src/tensor_write.cc | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/tensor_write.cc b/src/tensor_write.cc index 42fc5a2..8b86c97 100644 --- a/src/tensor_write.cc +++ b/src/tensor_write.cc @@ -36,21 +36,6 @@ tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor) die("Could not write Tensor Market banner (%d).\n", result); } -#if 0 - storage = STORAGE_COORIDINATE(tensor); - tuples = storage->tuples; - nnz = 0; - - for (i = 0; i < tensor->nnz; ++i) { - if (!might_as_well_be_zero(tensor->values[i])) { - nnz++; - } - } - - debug("tensor_write_coordinate: non-zero values: implied=%d, actual=%d.\n", tensor->nnz, nnz); - debug("tensor_write_coordinate: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n); -#endif - debug("tensor_write_coordinate: non-zero values: actual=%d.\n", tensor->nnz); debug("tensor_write_coordinate: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n); From 5f7961604bd3fd937d006aa4bcd7a8dd322eb657 Mon Sep 17 
00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 15:21:41 -0700 Subject: [PATCH 06/57] + Since we support array tensors, we need to enable support in the lower level functions that do sanity checks for us --- src/compatible.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/compatible.cc b/src/compatible.cc index 1f9d1ac..1a14ecb 100644 --- a/src/compatible.cc +++ b/src/compatible.cc @@ -11,6 +11,7 @@ compatible(vector_t const *lhs, tensor_t const *rhs) debug("compatible(vector=0x%x, tensor=0x%x)\n", lhs, rhs); switch (rhs->strategy) { + case strategy::array: case strategy::compressed: case strategy::slice: case strategy::ekmr: @@ -23,7 +24,7 @@ compatible(vector_t const *lhs, tensor_t const *rhs) } if (!supported) { - die("Tensor strategy '%s' is not currently supported.\n", + die("compatible: tensor strategy '%s' is not currently supported.\n", strategy_to_string(rhs->strategy)); } From 39ee85e40e8b6eacf6457672aa680c4d560f4d3b Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 15:25:13 -0700 Subject: [PATCH 07/57] + All tensor/vector combinations work, so long as the tensor's tube is the correct length --- src/compatible.cc | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/compatible.cc b/src/compatible.cc index 1a14ecb..2f7ddfe 100644 --- a/src/compatible.cc +++ b/src/compatible.cc @@ -28,24 +28,12 @@ compatible(vector_t const *lhs, tensor_t const *rhs) strategy_to_string(rhs->strategy)); } - switch (rhs->orientation) { - case orientation::row: - case orientation::column: - case orientation::tube: - case orientation::lateral: - case orientation::horizontal: - case orientation::frontal: - compatible = (lhs->n == rhs->l); - break; - default: - compatible = false; - break; - } + compatible = (lhs->n == rhs->l); if (!compatible) { print_information(lhs); print_information(rhs); - die("Tensors and vector do not have matching dimensions.\n"); + die("Tensor and vector do not have matching 
dimensions.\n"); } } From 736e3fbc4c681ba4b3af27eb534f3780025a2580 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 17:56:18 -0700 Subject: [PATCH 08/57] + Added threading code into makefile --- src/Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Makefile b/src/Makefile index 675cf1f..6265660 100644 --- a/src/Makefile +++ b/src/Makefile @@ -16,7 +16,7 @@ ifndef SIMULATE EXTRA_DEBUG += -DNOSIMULATE endif EXTRA_CXXFLAGS=-c $(EXTRA_DEBUG) $(STRICT) $(INCLUDES) $(CPPX11) -EXTRA_LDFLAGS=-Wall $(EXTRA_DEBUG) +EXTRA_LDFLAGS=-Wall -lpthread $(EXTRA_DEBUG) HEADERS_CACHE=address.h cache.h hash.h HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h \ @@ -32,10 +32,11 @@ HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE) \ SOURCES_CACHE=address.cc cache.cc hash.cc SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc \ information.cc latex.cc memory.cc mmio.cc \ - operation_n_mode_product.cc operation_utility.cc random.cc \ - strings.cc timer.cc tool_convert.cc tool_effectuate.cc \ - tool_generate.cc tool_permute.cc tool_timing.cc \ - tool_utility.cc types.cc utility.cc + operation_n_mode_product.cc \ + operation_threaded_n_mode_product.cc operation_utility.cc \ + random.cc strings.cc timer.cc tool_convert.cc \ + tool_effectuate.cc tool_generate.cc tool_permute.cc \ + tool_timing.cc tool_utility.cc types.cc utility.cc SOURCES_GENERATE=generate_tensor_from_matrix.cc SOURCES_MATRIX=matrix_arithmetic.cc matrix_clear.cc \ matrix_compatible.cc matrix_copy.cc matrix_free.cc \ From 8618749008a73fb536ad4ef9e0c0c8a99c78e374 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 17:58:47 -0700 Subject: [PATCH 09/57] + Added threading support for n-mode product + Global threads => thread_count --- src/main.cc | 2 +- src/operation_n_mode_product.cc | 24 ++++++++++++++++++++---- src/tool_effectuate.cc | 10 +++++----- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git 
a/src/main.cc b/src/main.cc index 1f10169..2b893fa 100644 --- a/src/main.cc +++ b/src/main.cc @@ -22,7 +22,7 @@ uint cache_size; uint cache_line_size; uint iterations; uint seed; -uint threads; +uint thread_count; char *tool_name; tool::type_t tool_type; bool tracing; diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index 8d60446..ac0ee24 100644 --- a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -10,6 +10,9 @@ #include #include +extern cache_t *cache; +extern uint thread_count; + /* Computing ($pT$): Let $\T \in R^{n\times n\times n}$ be a tensor. @@ -24,8 +27,6 @@ end for */ -extern cache_t *cache; - void compressed_row(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { @@ -412,9 +413,9 @@ n_mode_product_ekmr(matrix_t *matrix, vector_t const *vector, tensor_t const *te } void -operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { - debug("operation_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + debug("serial_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); compatible(vector, tensor); @@ -438,6 +439,21 @@ operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t cons } } +extern void +threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor); + +void +operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +{ + debug("operation_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + + if (1 == thread_count) { + serial_n_mode_product(matrix, vector, tensor); + } else { + threaded_n_mode_product(matrix, vector, tensor); + } +} + matrix_t* operation_n_mode_product(vector_t const *vector, tensor_t const *tensor) { diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index 
fc339f3..b16ce94 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -23,7 +23,7 @@ extern uint cache_size; extern uint cache_line_size; extern uint iterations; extern bool human_readable; -extern uint threads; +extern uint thread_count; extern char *tool_name; extern tool::type_t tool_type; extern bool simulate; @@ -52,7 +52,7 @@ effectuate_tool_usage() #if !defined (NOSIMULATE) message("\t-s\tsimulate cache (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_SIMULATE)); #endif - message("\t-t\tnumer of threads to use (default: %d)\n", DEFAULT_THREAD_COUNT); + message("\t-t\tnumer of thread_count to use (default: %d)\n", DEFAULT_THREAD_COUNT); message("\t-T\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING)); message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE)); message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max); @@ -219,9 +219,9 @@ effectuate_tool_main(int argc, char *argv[]) simulate = !simulate; break; case 't': - threads = atoi(optarg); - if (0 == threads) { - threads = DEFAULT_THREAD_COUNT; + thread_count = atoi(optarg); + if (0 == thread_count) { + thread_count = DEFAULT_THREAD_COUNT; } break; case 'T': From 2f90d1ceeda297f29fe2e48d197972efdf50c52e Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 17:59:52 -0700 Subject: [PATCH 10/57] + Array tensors now read and write array formats rather than coordinate formats --- src/tensor_read.cc | 38 +++++++++++++++----------------------- src/tensor_write.cc | 43 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 27 deletions(-) diff --git a/src/tensor_read.cc b/src/tensor_read.cc index 376b37c..0fcb404 100644 --- a/src/tensor_read.cc +++ b/src/tensor_read.cc @@ -10,13 +10,11 @@ tensor_t* tensor_fread_array(FILE *file) { - int i, j, k, v; - int l, m, n, nnz; - int result; - double d; + int i, j, k, index; + int l, m, n; + int line, result; tensor_t *tensor; - 
tensor_storage_coordinate_t *storage; - coordinate_tuple_t *tuples; + double *T; debug("tensor_fread_array(0x%x)\n", file); @@ -26,24 +24,18 @@ tensor_fread_array(FILE *file) debug("tensor_fread_array: l=%d, m=%d, n=%d\n", l, m, n); - nnz = l*m*n; - tensor = tensor_malloc(l, m, n, nnz, strategy::coordinate); - storage = STORAGE_COORIDINATE(tensor); - tuples = storage->tuples; - v = 0; - - for (k = 0; k < l; ++k) { - for (i = 0; i < m; ++i) { - for (j = 0; j < n; ++j) { - if (1 != (result = fscanf(file, "%lg\n", &d))) { - die("Failed to process line %d of the input stream (%d).\n", v, result); + tensor = tensor_malloc(l, m, n); + T = tensor->values; + line = 0; + + for (i = 0; i < m; ++i) { + for (j = 0; j < n; ++j) { + for (k = 0; k < l; ++k) { + index = tensor_index(tensor, i, j, k); + if (1 != (result = fscanf(file, "%lg\n", &T[index]))) { + die("Failed to process line %d of the input stream (%d).\n", line, result); } - tensor->values[v] = d; - tuples[v].i = i; - tuples[v].j = j; - tuples[v].k = k; - tuples[v].index = v; - v++; + line++; } } } diff --git a/src/tensor_write.cc b/src/tensor_write.cc index 8b86c97..8462f7c 100644 --- a/src/tensor_write.cc +++ b/src/tensor_write.cc @@ -19,6 +19,38 @@ tensor_initialize_typecode(MM_typecode *type, strategy::type_t strategy) mm_set_real(type); } +void +tensor_fwrite_array(FILE *file, tensor_t const *tensor) +{ + int i, j, k; + int l, m, n; + int result; + MM_typecode type; + double ***T; + + debug("tensor_write_array(file=0x%x, tensor=0x%x)\n", file, tensor); + + tensor_initialize_typecode(&type, strategy::array); + + if (0 != (result = mm_write_banner(file, type))) { + die("Could not write Tensor Market banner (%d).\n", result); + } + + debug("tensor_write_array: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n); + + if (0 != (result = mm_write_tensor_array_size(file, tensor->l, tensor->m, tensor->n))) { + die("Failed to write array tensor of size %d x %d x %d (%d).\n", tensor->l, tensor->m, tensor->n, result); + 
} + + for (i = 0; i < m; ++i) { + for (j = 0; j < n; ++j) { + for (k = 0; k < l; ++k) { + fprintf(file, "%10.6g\n", T[i][j][k]); + } + } + } +} + void tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor) { @@ -47,7 +79,7 @@ tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor) tuples = storage->tuples; for (i = 0; i < tensor->nnz; ++i) { - fprintf(file, "%d %d %d %10.32g\n", tuples[i].k, tuples[i].i, tuples[i].j, tensor->values[tuples[i].index]); + fprintf(file, "%d %d %d %10.6g\n", tuples[i].k, tuples[i].i, tuples[i].j, tensor->values[tuples[i].index]); } } @@ -95,7 +127,7 @@ tensor_fwrite_compressed(FILE *file, tensor_t const *tensor) } for (i = 0; i < nnz; ++i) { - fprintf(file, "%d %d %10.32g\n", storage->CO[i], storage->KO[i], tensor->values[i]); + fprintf(file, "%d %d %10.6g\n", storage->CO[i], storage->KO[i], tensor->values[i]); } } @@ -145,7 +177,7 @@ tensor_fwrite_compressed_slice(FILE *file, tensor_t const *tensor) } for (i = 0; i < nnz; ++i) { - fprintf(file, "%d %10.32g\n", storage->KO[i], tensor->values[i]); + fprintf(file, "%d %10.6g\n", storage->KO[i], tensor->values[i]); } } @@ -195,7 +227,7 @@ tensor_fwrite_extended_compressed(FILE *file, tensor_t const *tensor) } for (i = 0; i < nnz; ++i) { - fprintf(file, "%d %10.32g\n", storage->CK[i], tensor->values[i]); + fprintf(file, "%d %10.6g\n", storage->CK[i], tensor->values[i]); } } @@ -206,6 +238,9 @@ tensor_fwrite_implementation(FILE *file, tensor_t const *tensor) debug("tensor_fwrite_implementation: strategy='%s'\n", strategy_to_string(tensor->strategy)); switch (tensor->strategy) { + case strategy::array: + tensor_fwrite_array(file, tensor); + break; case strategy::coordinate: tensor_fwrite_coordinate(file, tensor); break; From cc67ca542f18112b7a9bd37a544d5366edadbf85 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 18:01:00 -0700 Subject: [PATCH 11/57] + Added a simple inline function to calculate offsets in to a tensor (when stored as vector) --- src/tensor.h | 1 + 
src/tensor_malloc.cc | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/tensor.h b/src/tensor.h index d3c14ca..378eb54 100644 --- a/src/tensor.h +++ b/src/tensor.h @@ -112,6 +112,7 @@ tensor_t* tensor_malloc(uint l, uint m, uint n, uint nnz, strategy::type_t strat orientation::type_t orientation = orientation::unknown, ownership::type_t owner = ownership::creator); tensor_t* tensor_malloc_from_template(tensor_t const *tensor); +uint tensor_index(tensor_t const *tensor, uint i, uint j, uint k); void tensor_free(tensor_t *tensor); tensor_t* tensor_copy_shallow(tensor_t *source); diff --git a/src/tensor_malloc.cc b/src/tensor_malloc.cc index f55f93d..ddce7ce 100644 --- a/src/tensor_malloc.cc +++ b/src/tensor_malloc.cc @@ -8,6 +8,11 @@ #include #include +uint +tensor_index(tensor_t const *tensor, uint i, uint j, uint k) { + return (i*tensor->n*tensor->m) + (j*tensor->m) + k; +} + tensor_t* tensor_malloc(uint l, uint m, uint n, ownership::type_t owner) { @@ -30,8 +35,8 @@ tensor_malloc(uint l, uint m, uint n, ownership::type_t owner) return tensor; } - tensor->values = MALLOC_N(double, l*m*n); - + tensor->values = MALLOC_N(double, l*m*n); + superfluous("tensor_malloc: tensor->values=0x%x\n", tensor->values); superfluous("tensor_malloc: tensor=0x%x\n", tensor); From 134006df19194caf61395fa0201b0ed449c7ca1c Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 18:02:57 -0700 Subject: [PATCH 12/57] + Serial and threaded n-mode product calculation for dense array tensors --- src/operation_threaded_n_mode_product.cc | 157 +++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 src/operation_threaded_n_mode_product.cc diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc new file mode 100644 index 0000000..2ebe875 --- /dev/null +++ b/src/operation_threaded_n_mode_product.cc @@ -0,0 +1,157 @@ + +#include "cache.h" +#include "compatible.h" +#include "error.h" +#include 
"matrix.h" +#include "operation.h" +#include "tensor.h" +#include "utility.h" +#include "vector.h" +#include +#include +#include + +extern cache_t *cache; +extern uint thread_count; + +/* + Computing ($pT$): + Let $\T \in R^{n\times n\times n}$ be a tensor. + Let $\M \in R^{n\times n}$ be a matrix. + Let $p \in R^{n}$ be a vector. + for i = 1 to l do + for j = 1 to m do + for k = 1 to m do + M[i][j] += p[k] * T[i][j][k] + end for + end for + end for +*/ + +typedef struct { + uint done; + matrix_t *matrix; + vector_t const *vector; + tensor_t const *tensor; +} product_thread_data_t; + +static pthread_mutex_t tube_lock; + +int +next_tube(product_thread_data_t *data) +{ + uint k; + + pthread_mutex_lock(&tube_lock); + k = data->done++; + pthread_mutex_unlock(&tube_lock); + return k < (data->tensor->m*data->tensor->n) ? k : -1; +} + +void* +fiber_product(void *arg) +{ + int k; + product_thread_data_t *data; + uint i, j, k, index; + uint m, n, l; + uint *P; + double **M, *T; + + data = (product_thread_data_t*) arg; + + M = matrix->data; + P = vector->data; + T = tensor->values; + + l = tensor->l; + m = tensor->m; + n = tensor->n; + + while (-1 != (k = next_tube(data))) { + offset = k*data->tensor->l; + for (i = 0; i < l; ++i) { + T[offset+i]; + // M[i][j] += P[k] * T[index]; + } + } + + return NULL; +} + +void +threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +{ + uint i; + pthread_t threads[32]; + int error; + int *status; + product_thread_data_t data; + + data.done = 0; + data.matrix = matrix; + data.vector = vector; + data.tensor = tensor; + + pthread_mutex_init(&tube_lock, NULL); + + for (i = 0; i < thread_count; ++i) { + if (0 != (error = pthread_create(&threads[i], NULL, fiber_product, &data))) { + die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error); + } + } + + for (i = 0; i < thread_count; ++i) { + if (0 != (error = pthread_join(threads[i], NULL))) { + die("threaded_n_mode_product_array: 
pthread_join() failed with %d\n", error); + } + } + + pthread_mutex_destroy(&tube_lock); +} + +void +n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +{ + uint i, j, k, index; + uint m, n, l; + uint *P; + double **M, *T; + + debug("n_mode_product_array(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + + M = matrix->data; + P = vector->data; + T = tensor->values; + + l = tensor->l; + m = tensor->m; + n = tensor->n; + + for (i = 0; i < m; ++i) { + for (j = 0; j < n; ++j) { + for (k = 0; k < l; ++k) { + index = tensor_index(tensor, i, j, k); + M[i][j] += P[k] * T[index]; + } + } + } +} + +void +threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +{ + debug("threaded_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + + compatible(vector, tensor); + + switch (tensor->strategy) { + case strategy::array: + n_mode_product_array(matrix, vector, tensor); + break; + default: + die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n", + strategy_to_string(tensor->strategy)); + break; + } +} From 01cbeb405f9157d515443698f5d9da2b336f4000 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 18:17:16 -0700 Subject: [PATCH 13/57] + Sample dense data --- results/dense3.in | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 results/dense3.in diff --git a/results/dense3.in b/results/dense3.in new file mode 100644 index 0000000..23bcbd8 --- /dev/null +++ b/results/dense3.in @@ -0,0 +1,29 @@ +%%MatrixMarket tensor array real general +3 3 3 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 From 3fcb1d2b743ddf9c17114b5418a38ca12a22e804 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 18:39:44 -0700 Subject: [PATCH 14/57] + Fixed typo in matrix file format enum: there was no 'unknown' entry --- src/matrix.h | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/matrix.h b/src/matrix.h index a8b32a8..e4a9693 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -7,7 +7,7 @@ namespace format { typedef enum { - format, + unknown, array, coordinate } type_t; From e54654c40c4e3fc363187accb5c9e4ac1c212262 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 18:40:38 -0700 Subject: [PATCH 15/57] + Added debugging output for matrix writing code + Decreased the number of decimal places printed to a file --- src/matrix_write.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/matrix_write.cc b/src/matrix_write.cc index 4f2b3cd..d8cc7f2 100644 --- a/src/matrix_write.cc +++ b/src/matrix_write.cc @@ -23,6 +23,8 @@ matrix_fwrite_array(FILE *file, matrix_t const *matrix) int result; MM_typecode type; + debug("matrix_fwrite_array(file=0x%x, matrix=0x%x)\n", file, matrix); + matrix_initialize_type(&type); mm_set_array(&type); @@ -33,10 +35,10 @@ matrix_fwrite_array(FILE *file, matrix_t const *matrix) if (0 != (result = mm_write_matrix_array_size(file, matrix->m, matrix->n))) { die("Failed to write matrix array size (%d).\n", result); } - + for (i = 0; i < matrix->m; ++i) { for (j = 0; j < matrix->n; ++j) { - fprintf(file, "%10.32g\n", matrix->data[i][j]); + fprintf(file, "%10.6g\n", matrix->data[i][j]); } } } @@ -48,6 +50,8 @@ matrix_fwrite_coordinate(FILE *file, matrix_t const *matrix) int nnz, result; MM_typecode type; + debug("matrix_fwrite_coordinate(file=0x%x, matrix=0x%x)\n", file, matrix); + matrix_initialize_type(&type); mm_set_coordinate(&type); @@ -71,7 +75,7 @@ matrix_fwrite_coordinate(FILE *file, matrix_t const *matrix) for (i = 0; i < matrix->m; ++i) { for (j = 0; j < matrix->n; ++j) { if (!might_as_well_be_zero(matrix->data[i][j])) { - fprintf(file, "%d %d %10.32g\n", i+1, j+1, matrix->data[i][j]); + fprintf(file, "%d %d %10.6g\n", i+1, j+1, matrix->data[i][j]); } } } @@ -80,6 +84,8 @@ matrix_fwrite_coordinate(FILE 
*file, matrix_t const *matrix) void matrix_fwrite(FILE *file, matrix_t const *matrix, format::type_t format) { + debug("matrix_fwrite(file=0x%x, matrix=0x%x)\n", file, matrix); + if (format::coordinate == format) { matrix_fwrite_coordinate(file, matrix); } else { @@ -91,7 +97,9 @@ void matrix_write(char const *filename, matrix_t const *matrix, format::type_t format) { FILE *file; - + + debug("matrix_write(0x%x)\n", file); + file = fopen_or_die(filename, "w+"); matrix_fwrite(file, matrix, format); fclose(file); From c16a4cf340b5b565976330f4a363968ab8d2a970 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 18:41:28 -0700 Subject: [PATCH 16/57] + Fixed typos in debug output statements --- src/tensor_write.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/tensor_write.cc b/src/tensor_write.cc index 8462f7c..8f69149 100644 --- a/src/tensor_write.cc +++ b/src/tensor_write.cc @@ -28,7 +28,7 @@ tensor_fwrite_array(FILE *file, tensor_t const *tensor) MM_typecode type; double ***T; - debug("tensor_write_array(file=0x%x, tensor=0x%x)\n", file, tensor); + debug("tensor_fwrite_array(file=0x%x, tensor=0x%x)\n", file, tensor); tensor_initialize_typecode(&type, strategy::array); @@ -36,7 +36,7 @@ tensor_fwrite_array(FILE *file, tensor_t const *tensor) die("Could not write Tensor Market banner (%d).\n", result); } - debug("tensor_write_array: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n); + debug("tensor_fwrite_array: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n); if (0 != (result = mm_write_tensor_array_size(file, tensor->l, tensor->m, tensor->n))) { die("Failed to write array tensor of size %d x %d x %d (%d).\n", tensor->l, tensor->m, tensor->n, result); @@ -60,7 +60,7 @@ tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor) tensor_storage_coordinate_t *storage; coordinate_tuple_t *tuples; - debug("tensor_write_coordinate(file=0x%x, tensor=0x%x)\n", file, tensor); + 
debug("tensor_fwrite_coordinate(file=0x%x, tensor=0x%x)\n", file, tensor); tensor_initialize_typecode(&type, strategy::coordinate); @@ -68,8 +68,8 @@ tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor) die("Could not write Tensor Market banner (%d).\n", result); } - debug("tensor_write_coordinate: non-zero values: actual=%d.\n", tensor->nnz); - debug("tensor_write_coordinate: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n); + debug("tensor_fwrite_coordinate: non-zero values: actual=%d.\n", tensor->nnz); + debug("tensor_fwrite_coordinate: l=%d, m=%d, n=%d.\n", tensor->l, tensor->m, tensor->n); if (0 != (result = mm_write_tensor_coordinate_size(file, tensor->l, tensor->m, tensor->n, tensor->nnz))) { die("Failed to write coordinate tensor of size %d (%d).\n", nnz, result); From 628f29a942cf2d50d2e0e939c7ca9e4f896cbba7 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 18:42:07 -0700 Subject: [PATCH 17/57] + Set correct default for the number of threads to use whilst running in parallel --- src/tool_effectuate.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index b16ce94..857c0cc 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -83,7 +83,7 @@ timed_matrix_write(int argc, char *argv[], int const offset, matrix_t const *mat } timer_start(&t); - matrix_fwrite(file, matrix, format::coordinate); + matrix_fwrite(file, matrix, format::array); timer_end(&t); print_elapsed_time(t); @@ -179,7 +179,8 @@ effectuate_tool_main(int argc, char *argv[]) int c; /* set the program's defaults */ - optcode = DEFAULT_OPERATION; + optcode = DEFAULT_OPERATION; + thread_count = DEFAULT_THREAD_COUNT; /* we will privide our own error messages */ opterr = 0; @@ -266,6 +267,7 @@ effectuate_tool_main(int argc, char *argv[]) /* print program options, for debugging purposes */ print_tool_options(); debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode)); + 
debug("effectuate_tool_main: thread_count=%d\n", thread_count); /* if we are just running a simulation, then we only do one iteration; otherwise, it would be really slow */ From 32f53338f827826651205296747c5f786e98cc74 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 18:43:15 -0700 Subject: [PATCH 18/57] + Fixed a small typo to check for the number of threads to use during the n-mode product calculation --- src/operation_n_mode_product.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index ac0ee24..9805e6b 100644 --- a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -412,6 +412,9 @@ n_mode_product_ekmr(matrix_t *matrix, vector_t const *vector, tensor_t const *te } } +extern void +n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor); + void serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { @@ -420,6 +423,9 @@ serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const * compatible(vector, tensor); switch (tensor->strategy) { + case strategy::array: + n_mode_product_array(matrix, vector, tensor); + break; case strategy::compressed: n_mode_product_compressed(matrix, vector, tensor); break; @@ -447,7 +453,7 @@ operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t cons { debug("operation_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); - if (1 == thread_count) { + if (thread_count <= 1) { serial_n_mode_product(matrix, vector, tensor); } else { threaded_n_mode_product(matrix, vector, tensor); From 0104201f8d856a225815d009e7129bcd4b69f71d Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 21:12:56 -0700 Subject: [PATCH 19/57] + First multi-threaded version working --- src/operation_threaded_n_mode_product.cc | 65 +++++++++++------------- 1 file changed, 30 insertions(+), 35 
deletions(-) diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc index 2ebe875..b0a411d 100644 --- a/src/operation_threaded_n_mode_product.cc +++ b/src/operation_threaded_n_mode_product.cc @@ -38,41 +38,40 @@ typedef struct { static pthread_mutex_t tube_lock; int -next_tube(product_thread_data_t *data) +next_tube(product_thread_data_t *p) { uint k; pthread_mutex_lock(&tube_lock); - k = data->done++; + k = p->done++; pthread_mutex_unlock(&tube_lock); - return k < (data->tensor->m*data->tensor->n) ? k : -1; + return k < (p->tensor->n*p->tensor->n) ? k : -1; } void* fiber_product(void *arg) { - int k; - product_thread_data_t *data; - uint i, j, k, index; - uint m, n, l; - uint *P; - double **M, *T; - - data = (product_thread_data_t*) arg; - - M = matrix->data; - P = vector->data; - T = tensor->values; - - l = tensor->l; - m = tensor->m; - n = tensor->n; - - while (-1 != (k = next_tube(data))) { - offset = k*data->tensor->l; - for (i = 0; i < l; ++i) { - T[offset+i]; - // M[i][j] += P[k] * T[index]; + int t; + uint i, j, k, offset; + uint n; + uint *P; + double **M, *T; + product_thread_data_t *p; + + p = (product_thread_data_t*) arg; + + M = p->matrix->data; + P = p->vector->data; + T = p->tensor->values; + + n = p->tensor->n; + + while (-1 != (t = next_tube(p))) { + offset = t*n; + i = t/n; + j = t%n; + for (k = 0; k < n; ++k) { + M[i][j] += P[k] * T[offset+k]; } } @@ -85,7 +84,6 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t uint i; pthread_t threads[32]; int error; - int *status; product_thread_data_t data; data.done = 0; @@ -94,7 +92,7 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t data.tensor = tensor; pthread_mutex_init(&tube_lock, NULL); - + for (i = 0; i < thread_count; ++i) { if (0 != (error = pthread_create(&threads[i], NULL, fiber_product, &data))) { die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error); @@ -114,23 
+112,20 @@ void n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { uint i, j, k, index; - uint m, n, l; + uint n; uint *P; double **M, *T; debug("n_mode_product_array(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + n = tensor->n; M = matrix->data; P = vector->data; T = tensor->values; - l = tensor->l; - m = tensor->m; - n = tensor->n; - - for (i = 0; i < m; ++i) { + for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { - for (k = 0; k < l; ++k) { + for (k = 0; k < n; ++k) { index = tensor_index(tensor, i, j, k); M[i][j] += P[k] * T[index]; } @@ -147,7 +142,7 @@ threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const switch (tensor->strategy) { case strategy::array: - n_mode_product_array(matrix, vector, tensor); + threaded_n_mode_product_array(matrix, vector, tensor); break; default: die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n", From b2eeb3c43c2740411f798b41846a96f9c77ef49d Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 7 Nov 2011 22:25:42 -0700 Subject: [PATCH 20/57] + Added a local sum variable to use to later write to the shared matrix data structure --- src/operation_threaded_n_mode_product.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc index b0a411d..557874e 100644 --- a/src/operation_threaded_n_mode_product.cc +++ b/src/operation_threaded_n_mode_product.cc @@ -53,7 +53,7 @@ fiber_product(void *arg) { int t; uint i, j, k, offset; - uint n; + uint n, sum; uint *P; double **M, *T; product_thread_data_t *p; @@ -67,12 +67,14 @@ fiber_product(void *arg) n = p->tensor->n; while (-1 != (t = next_tube(p))) { + sum = 0; offset = t*n; i = t/n; j = t%n; for (k = 0; k < n; ++k) { - M[i][j] += P[k] * T[offset+k]; + sum += P[k] * T[offset+k]; } + M[i][j] = sum; } return NULL; From d84724382e5b7e75457eeb6bbaec9827940a7c0b Mon Sep 17 
00:00:00 2001 From: Ben Burnett Date: Tue, 8 Nov 2011 04:59:42 -0700 Subject: [PATCH 21/57] + Stride based access to tubes, to improve cache performance --- src/operation_threaded_n_mode_product.cc | 97 +++++++++++++++++++++++- 1 file changed, 93 insertions(+), 4 deletions(-) diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc index 557874e..071d77b 100644 --- a/src/operation_threaded_n_mode_product.cc +++ b/src/operation_threaded_n_mode_product.cc @@ -29,7 +29,9 @@ extern uint thread_count; */ typedef struct { + uint *pdone; uint done; + uint id, offset, i; matrix_t *matrix; vector_t const *vector; tensor_t const *tensor; @@ -38,10 +40,11 @@ typedef struct { static pthread_mutex_t tube_lock; int -next_tube(product_thread_data_t *p) +serial_next_tube(product_thread_data_t *p) { - uint k; + uint k; + pthread_mutex_lock(&tube_lock); k = p->done++; pthread_mutex_unlock(&tube_lock); @@ -49,7 +52,7 @@ next_tube(product_thread_data_t *p) } void* -fiber_product(void *arg) +serial_fiber_product(void *arg) { int t; uint i, j, k, offset; @@ -66,7 +69,7 @@ fiber_product(void *arg) n = p->tensor->n; - while (-1 != (t = next_tube(p))) { + while (-1 != (t = serial_next_tube(p))) { sum = 0; offset = t*n; i = t/n; @@ -80,6 +83,91 @@ fiber_product(void *arg) return NULL; } +int +padded_next_tube(product_thread_data_t *p) +{ + uint k, choise; + + if (p->i < 10) { + choise = p->offset + p->i++; + } else { + p->offset += p->i*p->tensor->n; + p->i = 0; + choise = p->offset; + } + + pthread_mutex_lock(&tube_lock); + k = (*p->pdone)++; + pthread_mutex_unlock(&tube_lock); + return k < (p->tensor->n*p->tensor->n) ? 
choise : -1; +} + +void* +padded_fiber_product(void *arg) +{ + int t; + uint i, j, k, offset; + uint n, sum; + uint *P; + double **M, *T; + product_thread_data_t *p; + + p = (product_thread_data_t*) arg; + + M = p->matrix->data; + P = p->vector->data; + T = p->tensor->values; + n = p->tensor->n; + + while (-1 != (t = padded_next_tube(p))) { + sum = 0; + offset = t*n; + i = t/n; + j = t%n; + for (k = 0; k < n; ++k) { + sum += P[k] * T[offset+k]; + } + M[i][j] = sum; + } + + return NULL; +} + +void +threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +{ + uint i, done; + uint n; + pthread_t threads[32]; + int error; + product_thread_data_t data[32]; + + n = tensor->n; + pthread_mutex_init(&tube_lock, NULL); + + for (i = 0; i < thread_count; ++i) { + data[i].pdone = &done; + data[i].matrix = matrix; + data[i].vector = vector; + data[i].tensor = tensor; + data[i].offset = i*n; + data[i].i = 0; + data[i].id = i; + if (0 != (error = pthread_create(&threads[i], NULL, padded_fiber_product, &data[i]))) { + die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error); + } + } + + for (i = 0; i < thread_count; ++i) { + if (0 != (error = pthread_join(threads[i], NULL))) { + die("threaded_n_mode_product_array: pthread_join() failed with %d\n", error); + } + } + + pthread_mutex_destroy(&tube_lock); +} + +#if 0 void threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { @@ -109,6 +197,7 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t pthread_mutex_destroy(&tube_lock); } +#endif void n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) From 19fb803312972af18e07a5e1311cabdb85c40729 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Tue, 8 Nov 2011 05:33:44 -0700 Subject: [PATCH 22/57] + Test padding for improved scaling --- src/operation_threaded_n_mode_product.cc | 39 +++++++++++++++--------- 1 file changed, 24 
insertions(+), 15 deletions(-) diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc index 071d77b..c0d4e4e 100644 --- a/src/operation_threaded_n_mode_product.cc +++ b/src/operation_threaded_n_mode_product.cc @@ -29,9 +29,9 @@ extern uint thread_count; */ typedef struct { - uint *pdone; + uint *pdone, *dummy; uint done; - uint id, offset, i; + uint id, offset, i, stride; matrix_t *matrix; vector_t const *vector; tensor_t const *tensor; @@ -42,9 +42,8 @@ static pthread_mutex_t tube_lock; int serial_next_tube(product_thread_data_t *p) { - uint k; + uint k; - pthread_mutex_lock(&tube_lock); k = p->done++; pthread_mutex_unlock(&tube_lock); @@ -88,14 +87,16 @@ padded_next_tube(product_thread_data_t *p) { uint k, choise; - if (p->i < 10) { + if (p->i < p->stride) { choise = p->offset + p->i++; } else { - p->offset += p->i*p->tensor->n; - p->i = 0; + p->offset += p->stride; + p->i = 1; choise = p->offset; } + //message("offset=%d\n", p->offset); + pthread_mutex_lock(&tube_lock); k = (*p->pdone)++; pthread_mutex_unlock(&tube_lock); @@ -136,24 +137,32 @@ padded_fiber_product(void *arg) void threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { + uint stride; uint i, done; uint n; pthread_t threads[32]; int error; product_thread_data_t data[32]; + //message("sizeof(data)=%d\n", sizeof(data)); + n = tensor->n; pthread_mutex_init(&tube_lock, NULL); + done = 0; + stride = 32 > tensor->n ? tensor->n : 2; + thread_count = thread_count > tensor->n ? 
tensor->n : thread_count; + for (i = 0; i < thread_count; ++i) { - data[i].pdone = &done; - data[i].matrix = matrix; - data[i].vector = vector; - data[i].tensor = tensor; - data[i].offset = i*n; - data[i].i = 0; - data[i].id = i; - if (0 != (error = pthread_create(&threads[i], NULL, padded_fiber_product, &data[i]))) { + data[i+2].pdone = &done; + data[i+2].matrix = matrix; + data[i+2].vector = vector; + data[i+2].tensor = tensor; + data[i+2].offset = i*stride; + data[i+2].i = 0; + data[i+2].stride = stride; + data[i+2].id = i; + if (0 != (error = pthread_create(&threads[i], NULL, serial_fiber_product, &data[i+2]))) { die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error); } } From bfca0f6c362a84879522b4091947bf03f8e86750 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Tue, 8 Nov 2011 19:51:05 -0700 Subject: [PATCH 23/57] + Added simplified threading model --- src/Makefile | 6 +- src/thread.cc | 213 +++++++++++++++++++++++++++++++++++++ src/thread.h | 283 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 499 insertions(+), 3 deletions(-) create mode 100644 src/thread.cc create mode 100644 src/thread.h diff --git a/src/Makefile b/src/Makefile index 6265660..91406cc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -20,8 +20,8 @@ EXTRA_LDFLAGS=-Wall -lpthread $(EXTRA_DEBUG) HEADERS_CACHE=address.h cache.h hash.h HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h \ - memory.h operation.h random.h strings.h timer.h tool.h \ - utility.h compatible.h + memory.h operation.h random.h thread.h strings.h timer.h \ + tool.h utility.h compatible.h HEADERS_GENERATE=generate.h HEADERS_MATRIX=matrix.h mmio.h HEADERS_TENSOR=storage.h tensor.h @@ -34,7 +34,7 @@ SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc \ information.cc latex.cc memory.cc mmio.cc \ operation_n_mode_product.cc \ operation_threaded_n_mode_product.cc operation_utility.cc \ - random.cc strings.cc timer.cc tool_convert.cc \ + random.cc strings.cc 
thread.cc timer.cc tool_convert.cc \ tool_effectuate.cc tool_generate.cc tool_permute.cc \ tool_timing.cc tool_utility.cc types.cc utility.cc SOURCES_GENERATE=generate_tensor_from_matrix.cc diff --git a/src/thread.cc b/src/thread.cc new file mode 100644 index 0000000..eec77dd --- /dev/null +++ b/src/thread.cc @@ -0,0 +1,213 @@ +/*********************************************************************** + * pt.c -- thread utility routines + * + * Author: Mark Hays + */ + +#include "thread.h" + +#include +#include +#include /* for EBUSY */ + +/************************************************* + * attempt to lock a mutex + */ +int thread_mutex_trylock(pthread_mutex_t *m) +{ + int res; + + /* returns EBUSY if mutex is already locked, + * and EINVAL if the ptr is bad (on RedHat5.0) + * + * might this return EAGAIN on some systems?? + * i can't find any docs on this one's retval! + * + */ + if ((res=pthread_mutex_trylock(m)) != EBUSY) { + THREAD_DIE("thread_mutex_trylock",res); + } + return(res ? 1 : 0); +} + +void +thread_wait(pthread_t *thread, thread_address_t exitcode) +{ + thread_address_t code, p; + int errcode; + + p = (thread_address_t) ((exitcode)==NULL ? 
&code : exitcode); + if ((errcode=pthread_join(*thread, &p))) { + THREAD_DIE("thread_wait", errcode); + } +} + +/************************************************* + * run nthreads threads in the routine start + */ +void _thread_fork(int nthreads, + thread_function_t start, + thread_address_t arg, + thread_address_t *exitcodes) +{ + int i; + thread_argument_t *args; + thread_address_t *address; + + if (nthreads<1) { + die("thread_mutex_trylock: nthreads<1\n"); + } + if ((args=(thread_argument_t *) malloc(nthreads*sizeof(thread_argument_t)))==NULL) { + die("thread_fork: malloc failed!\n"); + } + for (i=0; ingate=0; gate->nthreads=nthreads; + thread_mutex_init( &gate->mutex); + thread_mutex_init( &gate->block); + thread_cond_init (&gate->condvar); + thread_cond_init ( &gate->last); +} + +/************************************************* + * destroy a gate variable + */ +void thread_gate_destroy(thread_gate_t *gate) +{ + gate->ngate=gate->nthreads=0; + thread_mutex_destroy( &gate->mutex); + thread_mutex_destroy( &gate->block); + thread_cond_destroy (&gate->condvar); + thread_cond_destroy ( &gate->last); +} + +/************************************************* + * enter the gate + */ +void thread_gate_sync(thread_gate_t *gate) +{ + if (gate->nthreads<2) return; /* trivial case */ + thread_mutex_lock(&gate->block); /* lock the block -- new + threads sleep here */ + thread_mutex_lock(&gate->mutex); /* lock the mutex */ + if (++(gate->ngate) < gate->nthreads) { /* are we the last one in? */ + thread_mutex_unlock(&gate->block); /* no, unlock block and */ + thread_cond_wait(&gate->condvar, /* go to sleep */ + &gate->mutex); + } else { /* yes, we're last */ + thread_cond_broadcast(&gate->condvar); /* wake everyone up and */ + thread_cond_wait(&gate->last,&gate->mutex); /* go to sleep til they're + all awake... then */ + thread_mutex_unlock(&gate->block); /* release the block */ + } + if (--(gate->ngate)==1) { /* next to last one out? 
*/ + thread_cond_broadcast(&gate->last); /* yes, wake up last one */ + } + thread_mutex_unlock(&gate->mutex); /* release the mutex */ +} + +/************************************************* + * Pipeline stage: the idea: + * + * main thread I/O thread + * \ / \ + * \ / \ + * gate1 | + * / \ | + * / \ | + * setup | work + * \ / | + * \ / | + * gate2 | + * / \ / + * / \_______/ + * | + * main continues + */ + +/************************************************* + * couple of convenient macros + */ +#define GATE1(pipeline) thread_gate_sync(&((pipeline)->gate1)) +#define GATE2(pipeline) thread_gate_sync(&((pipeline)->gate2)) +#define STAGE(pipeline) (*((pipeline)->stageproc))((pipeline)->gdata) +#define SETUP(pipeline) \ + { thread_function_t fp; \ + \ + if ((fp=(pipeline)->setupproc)!=NULL) (*fp)(pipeline->gdata); \ + } + +/************************************************* + * slave thread executes this + */ +static void _thread_pipeline_slave_code(thread_pipeline_t *pipeline) +{ + while (1) { + GATE1(pipeline); + if (pipeline->terminate) break; + GATE2(pipeline); + STAGE(pipeline); + } + thread_exit(NULL); +} + +/************************************************* + * init the info struct and start up the slave + */ +void _thread_pipeline_init(thread_pipeline_t *pipeline, + thread_address_t gdata, + thread_function_t setupproc, + thread_function_t stageproc) +{ + thread_gate_init(&(pipeline->gate1),2); + thread_gate_init(&(pipeline->gate2),2); + pipeline->terminate=0; + pipeline->gdata=gdata; + pipeline->setupproc=setupproc; + pipeline->stageproc=stageproc; + thread_create(&(pipeline->slave),_thread_pipeline_slave_code,pipeline); +} + +/************************************************* + * kill the slave, free resources + */ +void thread_pipeline_destroy(thread_pipeline_t *pipeline) +{ + pipeline->terminate=1; + GATE1(pipeline); + thread_wait(&(pipeline->slave),NULL); + thread_gate_destroy(&(pipeline->gate1)); + thread_gate_destroy(&(pipeline->gate2)); + 
pipeline->gdata=NULL; + pipeline->setupproc=NULL; + pipeline->stageproc=NULL; +} + +/************************************************* + * run the pipeline stage + */ +void thread_pipeline_execute(thread_pipeline_t *pipeline) +{ + GATE1(pipeline); + SETUP(pipeline); + GATE2(pipeline); +} + +/* EOF pt.c */ + diff --git a/src/thread.h b/src/thread.h new file mode 100644 index 0000000..8099434 --- /dev/null +++ b/src/thread.h @@ -0,0 +1,283 @@ +/*********************************************************************** + * thread.h, based on: + * pt.h -- pthreads utility macros + * + * Author: Mark Hays + */ + +#ifndef _THREAD_H_ +#define _THREAD_H_ + +/* Linux defs: + * _REENTRANT to get thread-safe libs + * _POSIX_SOURCE to get POSIX semantics + * _P is a hack for LinuxThreads -- on my box, + * pthread.h includes sched.h. My sched.h + * (incorrectly) declares prototypes with + * _P instead of __P (which is what everything + * else uses... Maybe it's just me. + */ +#ifdef __linux__ +# define _REENTRANT +# define _POSIX_SOURCE +# define _P __P +#endif + +#include +#include +#include "error.h" + +typedef void *(*thread_function_t)(void *); +typedef void *thread_address_t; + +#define THREAD_DIE(func,errcode) \ + die("%s:%d: %s: %s.\n",__FILE__,__LINE__,func,strerror(errcode)); + +/************************************************* + * low level wrappers that die on errors + */ +#define thread_create(t,start,arg) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_create(t, \ + NULL, \ + (thread_function_t) (start), \ + (thread_address_t) (arg)))) { \ + THREAD_DIE("thread_create", errcode); \ + } \ +} + +#define thread_create_detached(start,arg) \ +{ \ + pthread_t t; \ + int errcode; \ + \ + if ((errcode=pthread_create(&t, \ + NULL, \ + (thread_function_t) (start), \ + (thread_address_t) (arg)))) { \ + THREAD_DIE("thread_create_detached", errcode); \ + } \ + if (pthread_detach(t)) { \ + THREAD_DIE("thread_create_detached", errcode); \ + } \ +} + +void thread_wait(pthread_t 
thread, thread_address_t exitcode); + +#if 0 +#define thread_wait(t,exitcode) \ +{ \ + thread_address_t code; \ + int errcode; \ + \ + if ((errcode=pthread_join(*(t), \ + (thread_address_t) ((exitcode)==NULL ? &code : (exitcode))))) { \ + THREAD_DIE("thread_wait", errcode); \ + } \ +} +#endif + +#define thread_exit(status) \ +{ \ + pthread_exit(status); \ +} + +#define thread_mutex_init(m) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_mutex_init(m,NULL))) { \ + THREAD_DIE("thread_mutex_init", errcode); \ + } \ +} + +#define thread_mutex_destroy(m) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_mutex_destroy(m))) { \ + THREAD_DIE("thread_mutex_destroy", errcode); \ + } \ +} + +#define thread_cond_init(c) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_cond_init(c,NULL))) { \ + THREAD_DIE("thread_cond_init", errcode); \ + } \ +} + +#define thread_cond_destroy(c) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_cond_destroy(c))) { \ + THREAD_DIE("thread_cond_destroy", errcode); \ + } \ +} + +#define thread_mutex_lock(m) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_mutex_lock(m))) { \ + THREAD_DIE("thread_mutex_lock", errcode); \ + } \ +} + +/* This one has to do some extra checking so it + * isn't a macro... 
+ */ +extern int thread_mutex_trylock(pthread_mutex_t *m, char *msg); + +#define thread_mutex_unlock(m) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_mutex_unlock(m))) { \ + THREAD_DIE("thread_mutex_unlock", errcode); \ + } \ +} + +#define thread_cond_wait(c,m) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_cond_wait(c,m))) { \ + THREAD_DIE("thread_cond_wait", errcode); \ + } \ +} + +#define thread_cond_broadcast(c) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_cond_broadcast(c))) { \ + THREAD_DIE("thread_cond_broadcast", errcode); \ + } \ +} + +#define thread_cond_signal(c) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_cond_signal(c))) { \ + THREAD_DIE("thread_cond_signal", errcode); \ + } \ +} + +/************************************************* + * N threads simultaneously doing the same thing + */ + +typedef struct _thread_argument_t_ { + int myid; + int nthreads; + pthread_t self; + thread_address_t data; +} thread_argument_t; + +#define thread_myid(th_arg) ((th_arg)->myid) +#define thread_nthreads(th_arg) ((th_arg)->nthreads) +#define thread_data(th_arg) ((th_arg)->data) +#define thread_self(th_arg) (&((th_arg)->self)) +#define thread_thread(th_arg,id) (&(((th_arg)-((th_arg)->myid)+(id))->self)) + +#define thread_cancel(th_arg,id) \ +{ \ + int errcode; \ + \ + if ((errcode=pthread_cancel(((th_arg)-((th_arg)->myid)+(id))->self))) { \ + THREAD_DIE("thread_cancel", errcode); \ + } \ +} + +#define thread_cancel_all(th_arg) \ +{ \ + int myid=(th_arg)->myid,nt=(th_arg)->nthreads,i,errcode; \ + thread_argument_t *base=(th_arg)-myid; \ + \ + for (i=0; i Date: Tue, 8 Nov 2011 19:52:41 -0700 Subject: [PATCH 24/57] + Added stride parameter, to allow seperat threads to access memory that is far away for other threads --- src/main.cc | 1 + src/tool.h | 1 + src/tool_effectuate.cc | 18 ++++++++++++++---- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/main.cc b/src/main.cc index 2b893fa..ee64dec 100644 --- a/src/main.cc +++ 
b/src/main.cc @@ -21,6 +21,7 @@ cache_t *cache; uint cache_size; uint cache_line_size; uint iterations; +uint memory_stride; uint seed; uint thread_count; char *tool_name; diff --git a/src/tool.h b/src/tool.h index 851792d..f687070 100644 --- a/src/tool.h +++ b/src/tool.h @@ -24,6 +24,7 @@ namespace tool { #define DEFAULT_HUMAN_READABLE true #define DEFAULT_ITERATIONS 1 +#define DEFAULT_MEMORY_STRIDE 32 #define DEFAULT_OPERATION operation::n_mode_product #define DEFAULT_ORIENTATION orientation::row #define DEFAULT_PERMUTATION_HEURISTIC permutation_heuristic::none diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index 857c0cc..314d116 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -21,8 +21,9 @@ extern cache_t *cache; extern uint cache_size; extern uint cache_line_size; -extern uint iterations; extern bool human_readable; +extern uint iterations; +extern uint memory_stride; extern uint thread_count; extern char *tool_name; extern tool::type_t tool_type; @@ -179,14 +180,16 @@ effectuate_tool_main(int argc, char *argv[]) int c; /* set the program's defaults */ - optcode = DEFAULT_OPERATION; - thread_count = DEFAULT_THREAD_COUNT; + memory_stride = DEFAULT_MEMORY_STRIDE; + optcode = DEFAULT_OPERATION; + thread_count = DEFAULT_THREAD_COUNT; + /* we will privide our own error messages */ opterr = 0; /* extract any command-line options the user provided */ - while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:p:st:TuvV:w"))) { + while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:p:r:st:TuvV:w"))) { switch (c) { case 'h': effectuate_tool_usage(); @@ -216,6 +219,12 @@ effectuate_tool_main(int argc, char *argv[]) optcode = string_to_operation(optarg); } break; + case 'r': + memory_stride = atoi(optarg); + if (0 == memory_stride) { + memory_stride = DEFAULT_MEMORY_STRIDE; + } + break; case 's': simulate = !simulate; break; @@ -267,6 +276,7 @@ effectuate_tool_main(int argc, char *argv[]) /* print program options, for debugging purposes */ 
print_tool_options(); debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode)); + debug("effectuate_tool_main: memory_stride=%d\n", memory_stride); debug("effectuate_tool_main: thread_count=%d\n", thread_count); /* if we are just running a simulation, then we only do one From 3f997364189ae8f0c76a4e14d7578e7b9d1615a9 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Tue, 8 Nov 2011 19:54:24 -0700 Subject: [PATCH 25/57] + Updated tensor product to use siple threading model --- src/operation_threaded_n_mode_product.cc | 133 +++++------------------ 1 file changed, 29 insertions(+), 104 deletions(-) diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc index c0d4e4e..181696f 100644 --- a/src/operation_threaded_n_mode_product.cc +++ b/src/operation_threaded_n_mode_product.cc @@ -5,13 +5,14 @@ #include "matrix.h" #include "operation.h" #include "tensor.h" +#include "thread.h" #include "utility.h" #include "vector.h" #include #include -#include extern cache_t *cache; +extern uint memory_stride; extern uint thread_count; /* @@ -29,9 +30,7 @@ extern uint thread_count; */ typedef struct { - uint *pdone, *dummy; uint done; - uint id, offset, i, stride; matrix_t *matrix; vector_t const *vector; tensor_t const *tensor; @@ -40,87 +39,34 @@ typedef struct { static pthread_mutex_t tube_lock; int -serial_next_tube(product_thread_data_t *p) +serial_next_tube(product_thread_data_t *data) { uint k; - pthread_mutex_lock(&tube_lock); - k = p->done++; - pthread_mutex_unlock(&tube_lock); - return k < (p->tensor->n*p->tensor->n) ? k : -1; + thread_mutex_lock(&tube_lock); + k = data->done++; + thread_mutex_unlock(&tube_lock); + return k < (data->tensor->n*data->tensor->n) ? 
k : -1; } -void* -serial_fiber_product(void *arg) +thread_address_t +serial_fiber_product(thread_argument_t *argument) { int t; uint i, j, k, offset; uint n, sum; uint *P; double **M, *T; - product_thread_data_t *p; + product_thread_data_t *data; - p = (product_thread_data_t*) arg; + data = (product_thread_data_t*) thread_data(argument); - M = p->matrix->data; - P = p->vector->data; - T = p->tensor->values; + n = data->tensor->n; + M = data->matrix->data; + P = data->vector->data; + T = data->tensor->values; - n = p->tensor->n; - - while (-1 != (t = serial_next_tube(p))) { - sum = 0; - offset = t*n; - i = t/n; - j = t%n; - for (k = 0; k < n; ++k) { - sum += P[k] * T[offset+k]; - } - M[i][j] = sum; - } - - return NULL; -} - -int -padded_next_tube(product_thread_data_t *p) -{ - uint k, choise; - - if (p->i < p->stride) { - choise = p->offset + p->i++; - } else { - p->offset += p->stride; - p->i = 1; - choise = p->offset; - } - - //message("offset=%d\n", p->offset); - - pthread_mutex_lock(&tube_lock); - k = (*p->pdone)++; - pthread_mutex_unlock(&tube_lock); - return k < (p->tensor->n*p->tensor->n) ? choise : -1; -} - -void* -padded_fiber_product(void *arg) -{ - int t; - uint i, j, k, offset; - uint n, sum; - uint *P; - double **M, *T; - product_thread_data_t *p; - - p = (product_thread_data_t*) arg; - - M = p->matrix->data; - P = p->vector->data; - T = p->tensor->values; - n = p->tensor->n; - - while (-1 != (t = padded_next_tube(p))) { + while (-1 != (t = serial_next_tube(data))) { sum = 0; offset = t*n; i = t/n; @@ -137,43 +83,22 @@ padded_fiber_product(void *arg) void threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { - uint stride; - uint i, done; - uint n; - pthread_t threads[32]; - int error; - product_thread_data_t data[32]; - - //message("sizeof(data)=%d\n", sizeof(data)); + product_thread_data_t data; - n = tensor->n; - pthread_mutex_init(&tube_lock, NULL); + memory_stride = memory_stride > tensor->n ? 
tensor->n : memory_stride; + thread_count = thread_count > tensor->n ? tensor->n : thread_count; - done = 0; - stride = 32 > tensor->n ? tensor->n : 2; - thread_count = thread_count > tensor->n ? tensor->n : thread_count; + debug("threaded_n_mode_product_array: memory_stride=%d\n", memory_stride); + debug("threaded_n_mode_product_array: thread_count=%d\n", thread_count); - for (i = 0; i < thread_count; ++i) { - data[i+2].pdone = &done; - data[i+2].matrix = matrix; - data[i+2].vector = vector; - data[i+2].tensor = tensor; - data[i+2].offset = i*stride; - data[i+2].i = 0; - data[i+2].stride = stride; - data[i+2].id = i; - if (0 != (error = pthread_create(&threads[i], NULL, serial_fiber_product, &data[i+2]))) { - die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error); - } - } - - for (i = 0; i < thread_count; ++i) { - if (0 != (error = pthread_join(threads[i], NULL))) { - die("threaded_n_mode_product_array: pthread_join() failed with %d\n", error); - } - } + data.done = 0; + data.matrix = matrix; + data.vector = vector; + data.tensor = tensor; - pthread_mutex_destroy(&tube_lock); + thread_mutex_init(&tube_lock); + thread_fork(thread_count, serial_fiber_product, &data, NULL); + thread_mutex_destroy(&tube_lock); } #if 0 @@ -193,7 +118,7 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t pthread_mutex_init(&tube_lock, NULL); for (i = 0; i < thread_count; ++i) { - if (0 != (error = pthread_create(&threads[i], NULL, fiber_product, &data))) { + if (0 != (error = pthread_create(&threads[i], NULL, (void* (*)(void*))serial_fiber_product, &data))) { die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error); } } From b354da5972b51bc0bf3156679d2e2c41a75278ec Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Wed, 9 Nov 2011 12:11:58 -0700 Subject: [PATCH 26/57] + Removed sparse tensor support --- src/Makefile | 32 +- src/compatible.cc | 4 - src/generate_tensor_from_matrix.cc | 17 +- src/information.cc | 2 
+- src/main.cc | 2 - src/operation_n_mode_product.cc | 460 +++++------------------ src/operation_threaded_n_mode_product.cc | 177 --------- src/tensor.h | 66 +--- src/tensor_clear.cc | 80 +--- src/tensor_copy.cc | 3 +- src/tensor_emit_latex.cc | 57 +-- src/tensor_free.cc | 68 ---- src/tensor_malloc.cc | 19 +- src/tensor_read.cc | 8 + src/tensor_utility.cc | 53 --- src/tensor_validate.cc | 4 - src/tensor_write.cc | 4 + src/tool.h | 6 +- 18 files changed, 133 insertions(+), 929 deletions(-) delete mode 100644 src/operation_threaded_n_mode_product.cc diff --git a/src/Makefile b/src/Makefile index 91406cc..d59896a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -24,7 +24,7 @@ HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h \ tool.h utility.h compatible.h HEADERS_GENERATE=generate.h HEADERS_MATRIX=matrix.h mmio.h -HEADERS_TENSOR=storage.h tensor.h +HEADERS_TENSOR=tensor.h HEADERS_VECTOR=vector.h HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE) \ $(HEADERS_MATRIX) $(HEADERS_TENSOR) $(HEADERS_VECTOR) @@ -32,33 +32,23 @@ HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE) \ SOURCES_CACHE=address.cc cache.cc hash.cc SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc \ information.cc latex.cc memory.cc mmio.cc \ - operation_n_mode_product.cc \ - operation_threaded_n_mode_product.cc operation_utility.cc \ - random.cc strings.cc thread.cc timer.cc tool_convert.cc \ - tool_effectuate.cc tool_generate.cc tool_permute.cc \ - tool_timing.cc tool_utility.cc types.cc utility.cc + operation_n_mode_product.cc operation_utility.cc random.cc \ + strings.cc thread.cc timer.cc tool_effectuate.cc \ + tool_generate.cc tool_timing.cc tool_utility.cc types.cc \ + utility.cc SOURCES_GENERATE=generate_tensor_from_matrix.cc SOURCES_MATRIX=matrix_arithmetic.cc matrix_clear.cc \ matrix_compatible.cc matrix_copy.cc matrix_free.cc \ matrix_malloc.cc matrix_partition.cc matrix_supported.cc \ matrix_read.cc matrix_write.cc 
-SOURCES_STORAGE=tensor_storage_convert.cc \ - tensor_storage_compressed.cc \ - tensor_storage_compressed_slice.cc \ - tensor_storage_coordinate.cc tensor_storage_matrix_slice.cc \ - tensor_emit_latex.cc tensor_storage_ekmr.cc \ - tensor_storage_gundersen.cc tensor_storage_malloc.cc \ - tensor_storage_utility.cc tensor_storage_zzekmr.cc -SOURCES_TENSOR=tensor_arithmetic.cc tensor_clear.cc tensor_convert.cc \ - tensor_copy.cc tensor_free.cc tensor_malloc.cc \ - tensor_ownership.cc tensor_permute.cc tensor_supported.cc \ - tensor_read.cc tensor_write.cc tensor_utility.cc \ - tensor_validate.cc +SOURCES_TENSOR=tensor_arithmetic.cc tensor_clear.cc tensor_copy.cc \ + tensor_emit_latex.cc tensor_free.cc tensor_malloc.cc \ + tensor_ownership.cc tensor_supported.cc tensor_read.cc \ + tensor_write.cc tensor_utility.cc tensor_validate.cc SOURCES_VECTOR=vector_clear.cc vector_free.cc vector_malloc.cc \ vector_read.cc vector_write.cc -SOURCES=$(SOURCES_CACHE) $(SOURCES_GENERAL) $(SOURCES_GENERATE) \ - $(SOURCES_MATRIX) $(SOURCES_STORAGE) $(SOURCES_TENSOR) \ - $(SOURCES_VECTOR) main.cc +SOURCES=$(SOURCES_CACHE) $(SOURCES_GENERAL) $(SOURCES_GENERATE) \ + $(SOURCES_MATRIX) $(SOURCES_TENSOR) $(SOURCES_VECTOR) main.cc ASSEMBLER=$(SOURCES:.cc=.s) OBJECTS=$(ASSEMBLER:.s=.o) diff --git a/src/compatible.cc b/src/compatible.cc index 2f7ddfe..a46c7b5 100644 --- a/src/compatible.cc +++ b/src/compatible.cc @@ -87,10 +87,6 @@ compatible(tensor_t const *lhs, tensor_t const *rhs) { debug("compatible(tensor=0x%x, tensor=0x%x)\n", lhs, rhs); - if (lhs->nnz != rhs->nnz) { - die("Tensors do not have the same number non-zero entries.\n"); - } - if (lhs->l != rhs->l || lhs->m != rhs->m || lhs->n != rhs->n) { die("Tensors do not have the same dimensions.\n"); } diff --git a/src/generate_tensor_from_matrix.cc b/src/generate_tensor_from_matrix.cc index 468b4b9..2f4fd91 100644 --- a/src/generate_tensor_from_matrix.cc +++ b/src/generate_tensor_from_matrix.cc @@ -26,12 +26,11 @@ tensor_t* 
generate_tensor_from_matrix(matrix_t *matrix) { +#if 0 uint i, j, k; - uint nnz, size, n; + uint size, n; uint lower, upper; tensor_t *tensor; - tensor_storage_coordinate_t *storage; - coordinate_tuple_t *tuples; double *values; double **data; @@ -40,15 +39,6 @@ generate_tensor_from_matrix(matrix_t *matrix) n = matrix->n; upper = matrix->n*matrix->n; data = matrix->data; - nnz = 0; - - for (i = 0; i < n; ++i) { - for (j = 0; j < n; ++j) { - if (!might_as_well_be_zero(data[i][j])) { - nnz++; - } - } - } lower = nnz; nnz *= n; @@ -85,4 +75,7 @@ generate_tensor_from_matrix(matrix_t *matrix) } return tensor; +#endif + + return NULL; } diff --git a/src/information.cc b/src/information.cc index 5425b9d..4a71dd7 100644 --- a/src/information.cc +++ b/src/information.cc @@ -20,6 +20,6 @@ void print_information(tensor_t const* tensor) { debug("print_information(tensor=0x%x)\n", tensor); - debug("l=%d, m=%d, n=%d, nnz=%d\n", tensor->l, tensor->m, tensor->n, tensor->nnz); + debug("l=%d, m=%d, n=%d, nnz=%d\n", tensor->l, tensor->m, tensor->n); debug("strategy='%s', orientation='%s'\n", strategy_to_string(tensor->strategy), orientation_to_string(tensor->orientation)); } diff --git a/src/main.cc b/src/main.cc index ee64dec..b84b7bb 100644 --- a/src/main.cc +++ b/src/main.cc @@ -53,10 +53,8 @@ main(int argc, char *argv[]) } entrypoints[] = { { NULL }, { NULL }, - { &convert_tool_main }, { &generate_tool_main }, { &effectuate_tool_main }, - { &permute_tool_main }, { NULL } }; diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index 9805e6b..c5b1e57 100644 --- a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -4,6 +4,7 @@ #include "error.h" #include "matrix.h" #include "operation.h" +#include "thread.h" #include "tensor.h" #include "utility.h" #include "vector.h" @@ -11,8 +12,11 @@ #include extern cache_t *cache; +extern uint memory_stride; extern uint thread_count; +static pthread_mutex_t tube_lock; + /* Computing ($pT$): Let $\T 
\in R^{n\times n\times n}$ be a tensor. @@ -27,394 +31,119 @@ extern uint thread_count; end for */ -void -compressed_row(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) -{ - uint i, j, k; - uint rn, nnz; - uint start, end; - uint c, r, r0, t, m, n; - double **M; - double const *V; - uint const *p, *R, *C, *T; - tensor_storage_compressed_t const *storage; - - debug("compressed_row(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); - - p = vector->data; - M = matrix->data; - V = tensor->values; - nnz = tensor->nnz; - m = matrix->m; - n = matrix->n; - - storage = STORAGE_COMPRESSED(tensor); - rn = storage->rn; - R = storage->RO; - C = storage->CO; - T = storage->KO; - - /* - Using \emph{compressed row storage} ($\CRS$), this tensor can be - represented as: - - $k$ 0 1 2 3 4 5 6 7 8 9 10 11 - $\rowcrs$ & 0 & 4 & 8 & 12 - $\colcrs$ & 1 & 3 & 0 & 2 & 0 & 2 & 1 & 2 & 1 & 2 & 0 & 3 - $\tubecrs$ & 0 & 0 & 1 & 1 & 0 & 0 & 1 & 1 & 0 & 0 & 1 & 1 - $\valcrs$ & 1 & 2 & 7 & 8 & 3 & 4 & 9 & 10 & 5 & 6 & 11 & 12 - */ - - DEBUG("\n"); - - for (r = 1; r < rn; ++r) { - r0 = r-1; - i = r0 % n; - start = R[r0]; - end = R[r]; - - CACHE_ACCESS(cache, &R[r0], cache_operation::read, "R[r=%d]", r0); - CACHE_ACCESS(cache, &R[r], cache_operation::read, "R[r=%d]", r); - - DEBUG("start=%d, end=%d\n", start, end); - - for (k = start; k < end; ++k) { - - c = C[k]; - j = c; - t = T[k]; - - DEBUG("i=%d, j=%d, t=%d, r=%d, c=%d, k=%d\n", i, j, t, r, c, k); - - CACHE_ACCESS(cache, &C[k], cache_operation::read, "C[k=%d]", k); - CACHE_ACCESS(cache, &T[k], cache_operation::read, "T[k=%d]", k); - - // trace("(M[i=%2d][j=%2d]=%2.0f += (p[t=%2d]=%2d * V[k=%2d]=%2.0f)=%2.0f))=%2.0f\n", i, j, M[i][j], t, p[t], k, V[k], p[t] * V[k], M[i][j] + p[t] * V[k]); - - M[i][j] += p[t] * V[k]; - - CACHE_ACCESS(cache, &V[k], cache_operation::read, "V[k=%d]", k); - CACHE_ACCESS(cache, &p[t], cache_operation::read, "P[t=%d]", t); - CACHE_ACCESS(cache, &M[i][j], cache_operation::read, 
"M[i=%d][j=%d]", i, j); - CACHE_ACCESS(cache, &M[i][j], cache_operation::write, "M[i=%d][j=%d]", i, j); - - CACHE_DEBUG(cache); - } - } -} - -void -compressed_tube(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) -{ - uint i, j, k; - uint rn, nnz; - uint start, end; - uint c, r, r0, t, m, n; - double **M; - double const *V; - uint const *p, *R, *C, *T; - tensor_storage_compressed_t const *storage; - - debug("compressed_tube(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); - - p = vector->data; - M = matrix->data; - V = tensor->values; - nnz = tensor->nnz; - m = matrix->m; - n = matrix->n; - - storage = STORAGE_COMPRESSED(tensor); - rn = storage->rn; - R = storage->RO; - C = storage->CO; - T = storage->KO; - - /* - Using \emph{compressed row storage} ($\CRS$), this tensor can be - represented as: - - $k$ 0 1 2 3 4 5 6 7 8 9 10 11 - $\rowcrs$ & 0 & 4 & 8 & 12 - $\colcrs$ & 1 & 3 & 0 & 2 & 0 & 2 & 1 & 2 & 1 & 2 & 0 & 3 - $\tubecrs$ & 0 & 0 & 1 & 1 & 0 & 0 & 1 & 1 & 0 & 0 & 1 & 1 - $\valcrs$ & 1 & 2 & 7 & 8 & 3 & 4 & 9 & 10 & 5 & 6 & 11 & 12 - */ - - DEBUG("\n"); - - for (r = 1; r < rn; ++r) { - r0 = r-1; - i = r0 % n; - start = R[r0]; - end = R[r]; - - CACHE_ACCESS(cache, &R[r0], cache_operation::read, "R[r=%d]", r0); - CACHE_ACCESS(cache, &R[r], cache_operation::read, "R[r=%d]", r); - - DEBUG("start=%d, end=%d\n", start, end); - - for (k = start; k < end; ++k) { - c = C[k]; - t = T[k]; // row - j = t; - - DEBUG("i=%d, j=%d, t=%d, r=%d, c=%d, k=%d\n", i, j, t, r, c, k); - - CACHE_ACCESS(cache, &C[k], cache_operation::read, "C[k=%d]", k); - CACHE_ACCESS(cache, &T[k], cache_operation::read, "T[k=%d]", k); - - // trace("(M[i=%2d][j=%2d]=%2.0f += (p[c=%2d]=%2d * V[k=%2d]=%2.0f)=%2.0f))=%2.0f\n", i, j, M[i][j], c, p[c], k, V[k], p[c] * V[k], M[i][j] + p[c] * V[k]); - - M[i][j] += p[c] * V[k]; - - CACHE_ACCESS(cache, &V[k], cache_operation::read, "V[k=%d]", k); - CACHE_ACCESS(cache, &p[c], cache_operation::read, "P[c=%d]", c); - 
CACHE_ACCESS(cache, &M[i][j], cache_operation::read, "M[i=%d][j=%d]", i, j); - CACHE_ACCESS(cache, &M[i][j], cache_operation::write, "M[i=%d][j=%d]", i, j); - - CACHE_DEBUG(cache); - } - } -} +typedef struct { + uint done; + matrix_t *matrix; + vector_t const *vector; + tensor_t const *tensor; +} product_thread_data_t; -void -n_mode_product_compressed(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +int +traditional_next_tube(product_thread_data_t *data) { - debug("n_mode_product_compressed(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + uint k; - switch (tensor->orientation) { - case orientation::row: - compressed_row(matrix, vector, tensor); - break; - case orientation::tube: - compressed_tube(matrix, vector, tensor); - break; - default: - die("Tensor product for '%s' orientation is not currently supported.\n", - orientation_to_string(tensor->orientation)); - break; - } + thread_mutex_lock(&tube_lock); + k = data->done++; + thread_mutex_unlock(&tube_lock); + return k < (data->tensor->n*data->tensor->n) ? 
k : -1; } -typedef void (*index_convert_t)(uint rr, uint kk, uint n, uint *i, uint *j, uint *t); - -void -converter_for_lateral(uint rr, uint kk, uint n, uint *i, uint *j, uint *t) +thread_address_t +traditional_fiber_product(thread_argument_t *argument) { - *i = kk / n; - *j = rr; - *t = kk % n; -} - -void -converter_for_horizontal(uint rr, uint kk, uint n, uint *i, uint *j, uint *t) -{ - *i = rr; - *j = kk / n; - *t = kk % n; -} - -void -converter_for_frontal(uint rr, uint kk, uint n, uint *i, uint *j, uint *t) -{ - *i = kk / n; - *j = kk % n; - *t = rr; -} - -void -compressed_slice(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor, index_convert_t converter) -{ - uint i, j, k, kk; - uint rn, nnz; - uint start, end; - uint r, rr, r0, t, m, n; - double **M; - double const *V; - uint const *p, *R, *K; - tensor_storage_compressed_t const *storage; - - debug("compressed_slice(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); - - p = vector->data; - M = matrix->data; - V = tensor->values; - nnz = tensor->nnz; - m = matrix->m; - n = matrix->n; - - storage = STORAGE_COMPRESSED(tensor); - rn = storage->rn; - R = storage->RO; - K = storage->KO; - - /* - Using \emph{compressed row storage} ($\CRS$), this tensor can be - represented as: - - $k$ 0 1 2 3 4 5 6 7 8 9 10 11 - $\rowcrs$ & 0 & 4 & 8 & 12 - $\colcrs$ & 1 & 3 & 0 & 2 & 0 & 2 & 1 & 2 & 1 & 2 & 0 & 3 - $\tubecrs$ & 0 & 0 & 1 & 1 & 0 & 0 & 1 & 1 & 0 & 0 & 1 & 1 - $\valcrs$ & 1 & 2 & 7 & 8 & 3 & 4 & 9 & 10 & 5 & 6 & 11 & 12 - */ - - DEBUG("\n"); - - for (r = 1; r < rn; ++r) { - r0 = r-1; - rr = r0; - start = R[r0]; - end = R[r]; - - CACHE_ACCESS(cache, &R[r0], cache_operation::read, "R[r=%d]", r0); - CACHE_ACCESS(cache, &R[r], cache_operation::read, "R[r=%d]", r); - - DEBUG("start=%d, end=%d\n", start, end); - - for (k = start; k < end; ++k) { - kk = K[k]; - - converter(rr, kk, n, &i, &j, &t); - DEBUG("i=%d, j=%d, t=%d, r=%d, k=%d\n", i, j, t, r, k); - - CACHE_ACCESS(cache, &K[k], 
cache_operation::read, "K[k=%d]", k); - - // trace("(M[i=%2d][j=%2d]=%2.0f += (p[t=%2d]=%2d * V[k=%2d]=%2.0f)=%2.0f))=%2.0f\n", i, j, M[i][j], t, p[t], k, V[k], p[t] * V[k], M[i][j] + p[t] * V[k]); - - M[i][j] += p[t] * V[k]; - - CACHE_ACCESS(cache, &V[k], cache_operation::read, "V[k=%d]", k); - CACHE_ACCESS(cache, &p[t], cache_operation::read, "P[t=%d]", t); - CACHE_ACCESS(cache, &M[i][j], cache_operation::read, "M[i=%d][j=%d]", i, j); - CACHE_ACCESS(cache, &M[i][j], cache_operation::write, "M[i=%d][j=%d]", i, j); - - CACHE_DEBUG(cache); + int t; + uint i, j, k, offset; + uint n, sum; + uint *P; + double **M, *T; + product_thread_data_t *data; + + data = (product_thread_data_t*) thread_data(argument); + + n = data->tensor->n; + M = data->matrix->data; + P = data->vector->data; + T = data->tensor->values; + + while (-1 != (t = traditional_next_tube(data))) { + sum = 0; + offset = t*n; + i = t/n; + j = t%n; + for (k = 0; k < n; ++k) { + sum += P[k] * T[offset+k]; } + M[i][j] = sum; } + + return NULL; } void -n_mode_product_compressed_slice(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { - index_convert_t converter; + product_thread_data_t data; - debug("n_mode_product_compressed_slice(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + memory_stride = memory_stride > tensor->n ? tensor->n : memory_stride; + thread_count = thread_count > tensor->n ? 
tensor->n : thread_count; - converter = NULL; + debug("threaded_n_mode_product_array: memory_stride=%d\n", memory_stride); + debug("threaded_n_mode_product_array: thread_count=%d\n", thread_count); - switch (tensor->orientation) { - case orientation::horizontal: - converter = &converter_for_horizontal; - break; - case orientation::lateral: - converter = &converter_for_lateral; - break; - case orientation::frontal: - converter = &converter_for_frontal; - break; - default: - die("Tensor product for '%s' orientation is not currently supported.\n", - orientation_to_string(tensor->orientation)); - break; - } + data.done = 0; + data.matrix = matrix; + data.vector = vector; + data.tensor = tensor; - compressed_slice(matrix, vector, tensor, converter); + thread_mutex_init(&tube_lock); + thread_fork(thread_count, traditional_fiber_product, &data, NULL); + thread_mutex_destroy(&tube_lock); } - + void -ekmr_row(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +serial_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { - uint i, j, k; - uint rn, nnz; - uint start, end; - uint c, ck, r, r0, t, m, n; - double **M; - double const *V; - uint const *p, *R, *CK; - tensor_storage_extended_t const *storage; - - debug("ekmr_row(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); - - p = vector->data; - M = matrix->data; - V = tensor->values; - nnz = tensor->nnz; - m = matrix->m; - n = matrix->n; - - storage = STORAGE_EXTENDED(tensor); - rn = storage->rn; - R = storage->RO; - CK = storage->CK; - - /* - Now, using \emph{extended compressed row storage} ($\ECRS$), the - original tensor can be represented as: - - $k$ 0 1 2 3 4 5 6 7 8 9 10 11 - $\rowcrs$ & 0 & 4 & 8 & 12 - $\ctcrs$ & 1 & 2 & 5 & 6 & 0 & 3 & 4 & 5 & 1 & 2 & 4 & 7 - $\valcrs$ & 7 & 1 & 8 & 2 & 3 & 9 & 4 & 10 & 11 & 5 & 6 & 12 - */ - - DEBUG("\n"); - - for (r = 1; r < rn; ++r) { - r0 = r-1; - i = r0 % n; - start = R[r0]; - end = R[r]; - - 
CACHE_ACCESS(cache, &R[r0], cache_operation::read, "R[r=%d]", r0); - CACHE_ACCESS(cache, &R[r], cache_operation::read, "R[r=%d]", r); - - DEBUG("start=%d, end=%d\n", start, end); - - for (k = start; k < end; ++k) { - ck = CK[k]; - c = ck / n; - j = c; - t = ck % n; - - DEBUG("i=%d, j=%d, t=%d, r=%d, c=%d, k=%d\n", i, j, t, r, c, k); - - CACHE_ACCESS(cache, &CK[k], cache_operation::read, "CK[k=%d]", k); - - // trace("(M[i=%2d][j=%2d]=%2.0f += (p[t=%2d]=%2d * V[k=%2d]=%2.0f)=%2.0f))=%2.0f\n", i, j, M[i][j], t, p[t], k, V[k], p[t] * V[k], M[i][j] + p[t] * V[k]); - - M[i][j] += p[t] * V[k]; - - CACHE_ACCESS(cache, &V[k], cache_operation::read, "V[k=%d]", k); - CACHE_ACCESS(cache, &p[t], cache_operation::read, "P[t=%d]", t); - CACHE_ACCESS(cache, &M[i][j], cache_operation::read, "M[i=%d][j=%d]", i, j); - CACHE_ACCESS(cache, &M[i][j], cache_operation::write, "M[i=%d][j=%d]", i, j); - - CACHE_DEBUG(cache); + uint i, j, k, index; + uint n; + uint *P; + double **M, *T; + + debug("n_mode_product_array(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + + n = tensor->n; + M = matrix->data; + P = vector->data; + T = tensor->values; + + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) { + for (k = 0; k < n; ++k) { + index = tensor_index(tensor, i, j, k); + M[i][j] += P[k] * T[index]; + } } } } void -n_mode_product_ekmr(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { - debug("n_mode_product_ekmr(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + debug("threaded_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + + compatible(vector, tensor); - switch (tensor->orientation) { - case orientation::row: - ekmr_row(matrix, vector, tensor); + switch (tensor->strategy) { + case strategy::array: + threaded_n_mode_product_array(matrix, vector, tensor); break; default: - die("Tensor product for '%s' 
orientation is not currently supported.\n", - orientation_to_string(tensor->orientation)); + die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n", + strategy_to_string(tensor->strategy)); break; } } -extern void -n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor); - void serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { @@ -424,30 +153,15 @@ serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const * switch (tensor->strategy) { case strategy::array: - n_mode_product_array(matrix, vector, tensor); - break; - case strategy::compressed: - n_mode_product_compressed(matrix, vector, tensor); - break; - case strategy::slice: - n_mode_product_compressed_slice(matrix, vector, tensor); - break; - case strategy::ekmr: - case strategy::zzekmr: /* NOTE: the encoding may differ, but the - way we calculate products remains the - same. How is that for simplicity? */ - n_mode_product_ekmr(matrix, vector, tensor); + serial_n_mode_product_array(matrix, vector, tensor); break; default: - die("Tensor product for '%s' strategy is not currently supported.\n", + die("serial_n_mode_product: tensor product for '%s' strategy is not currently supported.\n", strategy_to_string(tensor->strategy)); break; } } -extern void -threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor); - void operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { diff --git a/src/operation_threaded_n_mode_product.cc b/src/operation_threaded_n_mode_product.cc deleted file mode 100644 index 181696f..0000000 --- a/src/operation_threaded_n_mode_product.cc +++ /dev/null @@ -1,177 +0,0 @@ - -#include "cache.h" -#include "compatible.h" -#include "error.h" -#include "matrix.h" -#include "operation.h" -#include "tensor.h" -#include "thread.h" -#include "utility.h" -#include "vector.h" -#include -#include - -extern cache_t 
*cache; -extern uint memory_stride; -extern uint thread_count; - -/* - Computing ($pT$): - Let $\T \in R^{n\times n\times n}$ be a tensor. - Let $\M \in R^{n\times n}$ be a matrix. - Let $p \in R^{n}$ be a vector. - for i = 1 to l do - for j = 1 to m do - for k = 1 to m do - M[i][j] += p[k] * T[i][j][k] - end for - end for - end for -*/ - -typedef struct { - uint done; - matrix_t *matrix; - vector_t const *vector; - tensor_t const *tensor; -} product_thread_data_t; - -static pthread_mutex_t tube_lock; - -int -serial_next_tube(product_thread_data_t *data) -{ - uint k; - - thread_mutex_lock(&tube_lock); - k = data->done++; - thread_mutex_unlock(&tube_lock); - return k < (data->tensor->n*data->tensor->n) ? k : -1; -} - -thread_address_t -serial_fiber_product(thread_argument_t *argument) -{ - int t; - uint i, j, k, offset; - uint n, sum; - uint *P; - double **M, *T; - product_thread_data_t *data; - - data = (product_thread_data_t*) thread_data(argument); - - n = data->tensor->n; - M = data->matrix->data; - P = data->vector->data; - T = data->tensor->values; - - while (-1 != (t = serial_next_tube(data))) { - sum = 0; - offset = t*n; - i = t/n; - j = t%n; - for (k = 0; k < n; ++k) { - sum += P[k] * T[offset+k]; - } - M[i][j] = sum; - } - - return NULL; -} - -void -threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) -{ - product_thread_data_t data; - - memory_stride = memory_stride > tensor->n ? tensor->n : memory_stride; - thread_count = thread_count > tensor->n ? 
tensor->n : thread_count; - - debug("threaded_n_mode_product_array: memory_stride=%d\n", memory_stride); - debug("threaded_n_mode_product_array: thread_count=%d\n", thread_count); - - data.done = 0; - data.matrix = matrix; - data.vector = vector; - data.tensor = tensor; - - thread_mutex_init(&tube_lock); - thread_fork(thread_count, serial_fiber_product, &data, NULL); - thread_mutex_destroy(&tube_lock); -} - -#if 0 -void -threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) -{ - uint i; - pthread_t threads[32]; - int error; - product_thread_data_t data; - - data.done = 0; - data.matrix = matrix; - data.vector = vector; - data.tensor = tensor; - - pthread_mutex_init(&tube_lock, NULL); - - for (i = 0; i < thread_count; ++i) { - if (0 != (error = pthread_create(&threads[i], NULL, (void* (*)(void*))serial_fiber_product, &data))) { - die("threaded_n_mode_product_array: pthread_create() failed with %d\n", error); - } - } - - for (i = 0; i < thread_count; ++i) { - if (0 != (error = pthread_join(threads[i], NULL))) { - die("threaded_n_mode_product_array: pthread_join() failed with %d\n", error); - } - } - - pthread_mutex_destroy(&tube_lock); -} -#endif - -void -n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) -{ - uint i, j, k, index; - uint n; - uint *P; - double **M, *T; - - debug("n_mode_product_array(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); - - n = tensor->n; - M = matrix->data; - P = vector->data; - T = tensor->values; - - for (i = 0; i < n; ++i) { - for (j = 0; j < n; ++j) { - for (k = 0; k < n; ++k) { - index = tensor_index(tensor, i, j, k); - M[i][j] += P[k] * T[index]; - } - } - } -} - -void -threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) -{ - debug("threaded_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); - - compatible(vector, tensor); - - switch (tensor->strategy) { - case 
strategy::array: - threaded_n_mode_product_array(matrix, vector, tensor); - break; - default: - die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n", - strategy_to_string(tensor->strategy)); - break; - } -} diff --git a/src/tensor.h b/src/tensor.h index 378eb54..57af6d8 100644 --- a/src/tensor.h +++ b/src/tensor.h @@ -7,14 +7,6 @@ #include "vector.h" #include -namespace permutation_heuristic { - typedef enum { - none, - naive_minimum, - naive_median - } type_t; -} - namespace file_format { typedef enum { unknown, @@ -49,11 +41,10 @@ namespace orientation { } typedef struct { - uint l, m, n, nnz; + uint l, m, n; strategy::type_t strategy; orientation::type_t orientation; ownership::type_t owner; - void *storage; double *values; } tensor_t; @@ -62,53 +53,8 @@ typedef struct { uint index; } coordinate_tuple_t; -typedef int (*index_compare_t)(const void *a, const void *b); -typedef uint (*index_encoder_t)(coordinate_tuple_t const *tuple); -typedef void (*index_copy_t)(void *destination, void const *source, uint i); - -typedef struct { - index_compare_t index_compare; - index_encoder_t index_r_encoder; - index_encoder_t index_c_encoder; - index_copy_t index_copy; -} conversion_callbacks_t; - -typedef struct { - conversion_callbacks_t *callbacks; -} tensor_storage_base_t; - -typedef struct { - tensor_storage_base_t dummy; - coordinate_tuple_t *tuples; -} tensor_storage_coordinate_t; - -typedef struct { - tensor_storage_base_t base; - uint n, rn, cn, tn, kn; - uint *RO, *CO, *TO, *KO; -} tensor_storage_compressed_t; - -typedef struct { - tensor_storage_base_t base; - uint rn, ckn; - uint *RO, *CK; -} tensor_storage_extended_t; - -typedef struct { - tensor_storage_extended_t dummy; -} tensor_storage_ekmr_t; - -typedef struct { - tensor_storage_extended_t dummy; -} tensor_storage_zzekmr_t; - -#define STORAGE_BASE(x) ((tensor_storage_base_t*)x->storage) -#define STORAGE_COORIDINATE(x) ((tensor_storage_coordinate_t*)x->storage) -#define 
STORAGE_COMPRESSED(x) ((tensor_storage_compressed_t*)x->storage) -#define STORAGE_EXTENDED(x) ((tensor_storage_extended_t*)x->storage) - tensor_t* tensor_malloc(uint l, uint m, uint n, ownership::type_t owner = ownership::creator); -tensor_t* tensor_malloc(uint l, uint m, uint n, uint nnz, strategy::type_t strategy, +tensor_t* tensor_malloc(uint l, uint m, uint n, strategy::type_t strategy, orientation::type_t orientation = orientation::unknown, ownership::type_t owner = ownership::creator); tensor_t* tensor_malloc_from_template(tensor_t const *tensor); @@ -121,10 +67,10 @@ void tensor_transfer_ownership(tensor_t *destination, tensor_t *source); void tensor_clear(tensor_t *tensor); +#if 0 tensor_t *tensor_convert(tensor_t *tensor, strategy::type_t strategy, orientation::type_t orientation = orientation::unknown); void tensor_convert(tensor_t *destination, tensor_t *source); - -tensor_t* tensor_permute(tensor_t *tensor, permutation_heuristic::type_t heuristic); +#endif tensor_t *tensor_read(char const *filename); tensor_t *tensor_fread(FILE *file); @@ -139,18 +85,14 @@ void tensor_validate(tensor_t const *tensor); char const* strategy_to_string(strategy::type_t strategy); char const* orientation_to_string(orientation::type_t orientation); char const* orientation_to_latex_macro(orientation::type_t orientation); -char const* permutation_heuristic_to_string(permutation_heuristic::type_t heuristic); strategy::type_t string_to_strategy(char const *name); orientation::type_t string_to_orientation(char const *name); -permutation_heuristic::type_t string_to_permutation_heuristic(char const *name); strategy::type_t typecode_to_strategy(MM_typecode type); void strategy_to_typecode(MM_typecode *type, strategy::type_t strategy); void print_strategies(char const *format); void print_orientations(char const *format); void print_operations(char const *format); void print_operations_with_descriptions(char const *format); -void print_permutation_heuristics(char const *format); -void 
print_permutation_heuristics_with_descriptions(char const *format); #if 0 void tensor_add(tensor_t *t1, tensor_t const *t2); diff --git a/src/tensor_clear.cc b/src/tensor_clear.cc index fc23665..e76d23b 100644 --- a/src/tensor_clear.cc +++ b/src/tensor_clear.cc @@ -2,90 +2,18 @@ #include "error.h" #include "tensor.h" -void -tensor_storage_clear_coordinate(tensor_t *tensor) -{ - uint i; - tensor_storage_coordinate_t *storage; - coordinate_tuple_t *tuples; - - debug("tensor_storage_clear_coordinate(0x%x)\n", tensor); - - storage = STORAGE_COORIDINATE(tensor); - tuples = storage->tuples; - - for (i = 0; i < tensor->nnz; ++i) { - tuples[i].i = 0; - tuples[i].j = 0; - tuples[i].k = 0; - tuples[i].index = 0; - } -} - -void -tensor_storage_clear_compressed(tensor_t *tensor) -{ - uint i; - tensor_storage_compressed_t *storage; - - debug("tensor_storage_clear_compressed(0x%x)\n", tensor); - - storage = STORAGE_COMPRESSED(tensor); - - for (i = 0; i < storage->rn; ++i) { - storage->RO[i] = 0; - } - - for (i = 0; i < tensor->nnz; ++i) { - storage->CO[i] = 0; - storage->KO[i] = 0; - } -} - -void -tensor_storage_clear_extended(tensor_t *tensor) -{ - uint i; - tensor_storage_extended_t *storage; - - debug("tensor_storage_clear_ekmr(0x%x)\n", tensor); - - storage = STORAGE_EXTENDED(tensor); - - for (i = 0; i < storage->rn; ++i) { - storage->RO[i] = 0; - } - - for (i = 0; i < tensor->nnz; ++i) { - storage->CK[i] = 0; - } -} - void tensor_clear(tensor_t *tensor) { - uint i; + uint i, n; debug("tensor_clear(0x%x)\n", tensor); tensor_validate(tensor); - for (i = 0; i < tensor->nnz; ++i) { - tensor->values[i] = 0.0; - } + n = tensor->l*tensor->m*tensor->n; - switch (tensor->strategy) { - case strategy::coordinate: - tensor_storage_clear_coordinate(tensor); - break; - case strategy::compressed: - tensor_storage_clear_compressed(tensor); - break; - case strategy::ekmr: - case strategy::zzekmr: - tensor_storage_clear_extended(tensor); - break; - default: - die("Tensor storage strategy 
'%d' is not supported.\n", tensor->strategy); + for (i = 0; i < n; ++i) { + tensor->values[i] = 0.0; } } diff --git a/src/tensor_copy.cc b/src/tensor_copy.cc index 267ec71..1d2c9eb 100644 --- a/src/tensor_copy.cc +++ b/src/tensor_copy.cc @@ -9,7 +9,6 @@ tensor_copy_shallow(tensor_t *destination, tensor_t *source) destination->owner = ownership::viewer; destination->values = source->values; - destination->storage = source->storage; } tensor_t* @@ -19,7 +18,7 @@ tensor_copy_shallow(tensor_t *source) debug("tensor_copy_shallow(source=0x%x)\n", source); - destination = tensor_malloc(source->l, source->m, source->n, source->nnz, source->strategy, source->orientation, source->owner); + destination = tensor_malloc(source->l, source->m, source->n, source->strategy, source->orientation, source->owner); tensor_copy_shallow(destination, source); return destination; diff --git a/src/tensor_emit_latex.cc b/src/tensor_emit_latex.cc index d3ae185..3150066 100644 --- a/src/tensor_emit_latex.cc +++ b/src/tensor_emit_latex.cc @@ -46,47 +46,7 @@ print_footer(FILE *file) fprintf(file, "\\end{tabular}\n"); } -void -tensor_fwrite_compressed_latex(FILE *file, tensor_t const *tensor) -{ - uint l, m, n; - int nnz; - tensor_storage_compressed_t *storage; - char const *name, *macro; - - debug("tensor_fwrite_compressed_latex(file=0x%x, tensor=0x%x)\n", file, tensor); - - storage = STORAGE_COMPRESSED(tensor); - l = tensor->l; - m = tensor->m; - n = tensor->n; - nnz = tensor->nnz; - name = orientation_to_string(tensor->orientation); - macro = orientation_to_latex_macro(tensor->orientation); - - debug("tensor_fwrite_compressed_latex: l=%d, m=%d, n=%d, nnz=%d, orientation='%s', macro='%s'.\n", - l, m, n, nnz, name, macro); - - print_header(file, nnz); - print_hline(file, storage->rn); - fprintf(file, "$\\row_{\\%s}$ & ", macro); - for_each_fprintf(file, "%d%s", storage->RO, storage->rn, " & ", " \\\\\n"); - print_hline(file, storage->cn); - fprintf(file, "$\\col_{\\%s}$ & ", macro); - 
for_each_fprintf(file, "%d%s", storage->CO, storage->cn, " & ", " \\\\\n"); - print_hline(file, storage->tn); - fprintf(file, "$\\tube_{\\%s}$ & ", macro); - for_each_fprintf(file, "%d%s", storage->TO, storage->tn, " & ", " \\\\\n"); - print_hline(file, storage->kn); - fprintf(file, "$KO_{\\%s}$ & ", macro); - for_each_fprintf(file, "%d%s", storage->KO, storage->kn, " & ", " \\\\\n"); - print_hline(file, nnz); - fprintf(file, "$\\val_{\\%s}$ & ", macro); - for_each_fprintf(file, "%g%s", tensor->values, nnz, " & ", " \\\\\n"); - print_hline(file, nnz); - print_footer(file); -} - +#if 0 void tensor_fwrite_extended_compressed_latex(FILE *file, tensor_t const *tensor, strategy::type_t strategy) { @@ -121,25 +81,12 @@ tensor_fwrite_extended_compressed_latex(FILE *file, tensor_t const *tensor, stra print_hline(file, nnz); print_footer(file); } +#endif void tensor_emit_latex(FILE *file, tensor_t const *tensor) { debug("tensor_emit_latex(file=0x%x, tensor=0x%x)\n", file, tensor); debug("tensor_emit_latex: strategy='%s'\n", strategy_to_string(tensor->strategy)); - - switch (tensor->strategy) { - case strategy::compressed: - case strategy::slice: - tensor_fwrite_compressed_latex(file, tensor); - break; - case strategy::ekmr: - case strategy::zzekmr: - tensor_fwrite_extended_compressed_latex(file, tensor, tensor->strategy); - break; - default: - die("Emitting LaTeX source for storage strategy '%d' is not supported.\n", - strategy_to_string(tensor->strategy)); - } } diff --git a/src/tensor_free.cc b/src/tensor_free.cc index 58cdd70..696a2ad 100644 --- a/src/tensor_free.cc +++ b/src/tensor_free.cc @@ -6,73 +6,6 @@ #include #include -void -tensor_storage_free(tensor_storage_base_t *storage) -{ - superfluous("tensor_storage_free((tensor_storage_base_t*)0x%x)\n", storage); - - safe_free(storage->callbacks); -} - -void -tensor_storage_free(tensor_storage_coordinate_t *storage) -{ - superfluous("tensor_storage_free((tensor_storage_coordinate_t*)0x%x)\n", storage); - - 
safe_free(storage->tuples); -} - -void -tensor_storage_free(tensor_storage_compressed_t *storage) -{ - superfluous("tensor_storage_free((tensor_storage_compressed_t*)0x%x)\n", storage); - - safe_free(storage->RO); - safe_free(storage->CO); - safe_free(storage->TO); - safe_free(storage->KO); -} - -void -tensor_storage_free(tensor_storage_extended_t *storage) -{ - superfluous("tensor_storage_free((tensor_storage_extended_t*)0x%x)\n", storage); - - safe_free(storage->RO); - safe_free(storage->CK); -} - -void -tensor_storage_free(tensor_t *tensor) -{ - superfluous("tensor_storage_free(0x%x)\n", tensor); - - if (!tensor->storage) { - return; - } - - tensor_storage_free(STORAGE_BASE(tensor)); - - switch (tensor->strategy) { - case strategy::coordinate: - tensor_storage_free(STORAGE_COORIDINATE(tensor)); - break; - case strategy::compressed: - case strategy::slice: - tensor_storage_free(STORAGE_COMPRESSED(tensor)); - break; - case strategy::ekmr: - case strategy::zzekmr: - tensor_storage_free(STORAGE_EXTENDED(tensor)); - break; - default: - die("Tensor storage strategy '%d' is not supported.\n", - strategy_to_string(tensor->strategy)); - } - - safe_free(tensor->storage); -} - void tensor_free(tensor_t *tensor) { @@ -84,7 +17,6 @@ tensor_free(tensor_t *tensor) if (ownership::creator == tensor->owner) { safe_free(tensor->values); - tensor_storage_free(tensor); } safe_free(tensor); diff --git a/src/tensor_malloc.cc b/src/tensor_malloc.cc index ddce7ce..42f5be1 100644 --- a/src/tensor_malloc.cc +++ b/src/tensor_malloc.cc @@ -2,7 +2,6 @@ #include "error.h" #include "memory.h" #include "mmio.h" -#include "storage.h" #include "tensor.h" #include "utility.h" #include @@ -24,12 +23,10 @@ tensor_malloc(uint l, uint m, uint n, ownership::type_t owner) tensor->l = l; tensor->m = m; tensor->n = n; - tensor->nnz = 0; tensor->strategy = strategy::array; tensor->orientation = orientation::unknown; tensor->owner = owner; tensor->values = NULL; - tensor->storage = NULL; if 
(ownership::viewer == owner) { return tensor; @@ -44,35 +41,29 @@ tensor_malloc(uint l, uint m, uint n, ownership::type_t owner) } tensor_t* -tensor_malloc(uint l, uint m, uint n, uint nnz, strategy::type_t strategy, orientation::type_t orientation, ownership::type_t owner) +tensor_malloc(uint l, uint m, uint n, strategy::type_t strategy, orientation::type_t orientation, ownership::type_t owner) { tensor_t *tensor; - superfluous("tensor_malloc(l=%d, m=%d, n=%d, nnz=%d, strategy='%s', orientation='%s')\n", - l, m, n, nnz, strategy_to_string(strategy), orientation_to_string(orientation)); + superfluous("tensor_malloc(l=%d, m=%d, n=%d, strategy='%s', orientation='%s')\n", + l, m, n, strategy_to_string(strategy), orientation_to_string(orientation)); tensor = MALLOC(tensor_t); tensor->l = l; tensor->m = m; tensor->n = n; - tensor->nnz = nnz; tensor->strategy = strategy; tensor->orientation = orientation; tensor->owner = owner; tensor->values = NULL; - tensor->storage = NULL; if (ownership::viewer == owner) { return tensor; } - if (nnz > 0) { - tensor->values = MALLOC_N(double, nnz); - tensor->storage = tensor_storage_malloc(tensor); - } + tensor->values = MALLOC_N(double, l*m*n); superfluous("tensor_malloc: tensor->values=0x%x\n", tensor->values); - superfluous("tensor_malloc: tensor->storage=0x%x\n", tensor->storage); superfluous("tensor_malloc: tensor=0x%x\n", tensor); return tensor; @@ -83,5 +74,5 @@ tensor_malloc_from_template(tensor_t const *tensor) { superfluous("tensor_malloc_from_template(tensor=0x%x)\n", tensor); - return tensor_malloc(tensor->l, tensor->m, tensor->n, tensor->nnz, tensor->strategy, tensor->orientation, tensor->owner); + return tensor_malloc(tensor->l, tensor->m, tensor->n, tensor->strategy, tensor->orientation, tensor->owner); } diff --git a/src/tensor_read.cc b/src/tensor_read.cc index 0fcb404..775b946 100644 --- a/src/tensor_read.cc +++ b/src/tensor_read.cc @@ -43,6 +43,7 @@ tensor_fread_array(FILE *file) return tensor; } +#if 0 tensor_t* 
tensor_fread_coordinate(FILE *file) { @@ -213,6 +214,7 @@ tensor_fread_extended_compressed(FILE *file, strategy::type_t strategy) return tensor; } +#endif tensor_t* tensor_fread_mmio_data(FILE *file, MM_typecode type) @@ -237,6 +239,7 @@ tensor_fread_mmio_data(FILE *file, MM_typecode type) case strategy::array: tensor = tensor_fread_array(file); break; +#if 0 case strategy::coordinate: tensor = tensor_fread_coordinate(file); break; @@ -250,6 +253,7 @@ tensor_fread_mmio_data(FILE *file, MM_typecode type) case strategy::zzekmr: tensor = tensor_fread_extended_compressed(file, strategy); break; +#endif default: die("Tensor storage strategy '%d' is not supported.\n", strategy); } @@ -270,6 +274,7 @@ tensor_fread_mmio(FILE *file) return tensor_fread_mmio_data(file, type); } +#if 0 tensor_t* tensor_fread_matlab(FILE *file) { @@ -322,6 +327,7 @@ tensor_fread_matlab(FILE *file) return tensor; } +#endif file_format::type_t detect_file_format(FILE *file) @@ -357,9 +363,11 @@ tensor_fread_file_format(FILE *file, file_format::type_t format) case file_format::mmio: tensor = tensor_fread_mmio(file); break; +#if 0 case file_format::matlab: tensor = tensor_fread_matlab(file); break; +#endif default: die("tensor_fread_file_format: unknown file type %d.\n", format); break; diff --git a/src/tensor_utility.cc b/src/tensor_utility.cc index 9700df2..dbd1ff1 100644 --- a/src/tensor_utility.cc +++ b/src/tensor_utility.cc @@ -4,59 +4,6 @@ #include "utility.h" #include -static char const *map_permutation_heuristics_to_string[] = { - "none", - "naive-minimum", - "naive-median" -}; - -static char const *map_permutation_heuristics_to_description[] = { - "none", - "re-order tensor layout based on minimum intra-slice proximity", - "re-order tensor layout based on median intra-slice proximity" -}; - -char const* -permutation_heuristic_to_string(permutation_heuristic::type_t heuristic) -{ - return map_permutation_heuristics_to_string[heuristic]; -} - -permutation_heuristic::type_t 
-string_to_permutation_heuristic(char const *name) -{ - uint i; - - for (i = 0; i < COUNT_OF(map_permutation_heuristics_to_string); ++i) { - if (0 == strcmp(name, map_permutation_heuristics_to_string[i])) { - return (permutation_heuristic::type_t) i; - } - } - - return permutation_heuristic::none; -} - -void -print_permutation_heuristics(char const *format) -{ - uint i; - - for (i = 1; i < COUNT_OF(map_permutation_heuristics_to_string); ++i) { - message(format, map_permutation_heuristics_to_string[i]); - } -} - -void -print_permutation_heuristics_with_descriptions(char const *format) -{ - uint i; - - for (i = 1; i < COUNT_OF(map_permutation_heuristics_to_string); ++i) { - message(format, map_permutation_heuristics_to_string[i], - map_permutation_heuristics_to_description[i]); - } -} - static char const *map_strategy_to_string[] = { "unknown", "array", diff --git a/src/tensor_validate.cc b/src/tensor_validate.cc index 32808ed..96f47ea 100644 --- a/src/tensor_validate.cc +++ b/src/tensor_validate.cc @@ -12,8 +12,4 @@ tensor_validate(tensor_t const *tensor) if (!tensor->values) { die("Tensor values have not been allocated.\n"); } - - if (!tensor->storage) { - die("Tensor indexing strategy has not been allocated.\n"); - } } diff --git a/src/tensor_write.cc b/src/tensor_write.cc index 8f69149..67700e1 100644 --- a/src/tensor_write.cc +++ b/src/tensor_write.cc @@ -51,6 +51,7 @@ tensor_fwrite_array(FILE *file, tensor_t const *tensor) } } +#if 0 void tensor_fwrite_coordinate(FILE *file, tensor_t const *tensor) { @@ -230,6 +231,7 @@ tensor_fwrite_extended_compressed(FILE *file, tensor_t const *tensor) fprintf(file, "%d %10.6g\n", storage->CK[i], tensor->values[i]); } } +#endif void tensor_fwrite_implementation(FILE *file, tensor_t const *tensor) @@ -241,6 +243,7 @@ tensor_fwrite_implementation(FILE *file, tensor_t const *tensor) case strategy::array: tensor_fwrite_array(file, tensor); break; +#if 0 case strategy::coordinate: tensor_fwrite_coordinate(file, tensor); break; @@ 
-254,6 +257,7 @@ tensor_fwrite_implementation(FILE *file, tensor_t const *tensor) case strategy::zzekmr: tensor_fwrite_extended_compressed(file, tensor); break; +#endif default: die("Tensor storage strategy '%d' is not supported.\n", strategy_to_string(tensor->strategy)); diff --git a/src/tool.h b/src/tool.h index f687070..a40c1cf 100644 --- a/src/tool.h +++ b/src/tool.h @@ -12,10 +12,8 @@ namespace tool { typedef enum { unknown, tensor, - convert, generate, - effectuate, - permute + effectuate } type_t; } @@ -41,10 +39,8 @@ namespace tool { #define DEFAULT_CACHE_SIZE (2*1024) #define DEFAULT_CACHE_LINE_SIZE 32 -void convert_tool_main(int argc, char *argv[]); void generate_tool_main(int argc, char *argv[]); void effectuate_tool_main(int argc, char *argv[]); -void permute_tool_main(int argc, char *argv[]); vector_t* timed_vector_read(char const *name); matrix_t* timed_matrix_read(char const *name); From 1b33c664052d319317af0f62f119ca81f660ea4b Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Wed, 9 Nov 2011 13:55:23 -0700 Subject: [PATCH 27/57] + Removed sparse tensor storage code --- src/format.h | 164 ---------- src/storage.h | 42 --- src/tensor_convert.cc | 33 -- src/tensor_permute.cc | 403 ------------------------- src/tensor_storage_compressed.cc | 123 -------- src/tensor_storage_compressed_slice.cc | 122 -------- src/tensor_storage_convert.cc | 230 -------------- src/tensor_storage_coordinate.cc | 134 -------- src/tensor_storage_ekmr.cc | 100 ------ src/tensor_storage_gundersen.cc | 136 --------- src/tensor_storage_malloc.cc | 44 --- src/tensor_storage_matrix_slice.cc | 117 ------- src/tensor_storage_utility.cc | 198 ------------ src/tensor_storage_zzekmr.cc | 140 --------- src/tool_convert.cc | 169 ----------- src/tool_permute.cc | 171 ----------- 16 files changed, 2326 deletions(-) delete mode 100644 src/format.h delete mode 100644 src/storage.h delete mode 100644 src/tensor_convert.cc delete mode 100644 src/tensor_permute.cc delete mode 100644 
src/tensor_storage_compressed.cc delete mode 100644 src/tensor_storage_compressed_slice.cc delete mode 100644 src/tensor_storage_convert.cc delete mode 100644 src/tensor_storage_coordinate.cc delete mode 100644 src/tensor_storage_ekmr.cc delete mode 100644 src/tensor_storage_gundersen.cc delete mode 100644 src/tensor_storage_malloc.cc delete mode 100644 src/tensor_storage_matrix_slice.cc delete mode 100644 src/tensor_storage_utility.cc delete mode 100644 src/tensor_storage_zzekmr.cc delete mode 100644 src/tool_convert.cc delete mode 100644 src/tool_permute.cc diff --git a/src/format.h b/src/format.h deleted file mode 100644 index f499af8..0000000 --- a/src/format.h +++ /dev/null @@ -1,164 +0,0 @@ - -#pragma once -#ifndef _FORMAT_H_ -#define _FORMAT_H_ - -/*--------------------------------------------------------------------*/ - -#include "storage.h" -#include "tensor.h" -#include "utility.h" - -#include -#include -#include -#include - -/*--------------------------------------------------------------------*/ - -BEGIN_NAMESPACE(storage); - -/*--------------------------------------------------------------------*/ - -template -class tensor; - -/*--------------------------------------------------------------------*/ - -END_NAMESPACE(storage); - -/*--------------------------------------------------------------------*/ - -BEGIN_NAMESPACE(format); - -/*--------------------------------------------------------------------*/ - -const int precision = 32; - -/*--------------------------------------------------------------------*/ - -BEGIN_NAMESPACE(strategy); - -typedef enum { - coordinate = 0, - max = coordinate -} type; - -END_NAMESPACE(strategy); - -/*--------------------------------------------------------------------*/ - -template -class coordinate { - -public: - - std::istream& - read(std::istream& in, strategy::tensor& data) { - - T v; - int i, j, k; - int n, nnz; - - /* determine the data's dimensionality */ - in >> n >> n >> n >> nnz; - - /* make sure we will not have 
to realloc during the read */ - data.initialize(n, nnz); - - /* read the data */ - while (nnz--) { - in >> k >> i >> j >> v; - data.set(k, j, i, v); - } - - /* all done */ - return in; - - } - - std::ostream& - write(std::ostream& out, strategy::tensor const &data) const { - - T v; - int n, nnz; - - /* make sure we have some data */ - if (data.empty()) { - std::cerr << "ERROR: no data to print!" << std::endl; - return out; - } - - /* determine the data's dementionality */ - n = data.size(); - - /* determine the number of non-zero entries */ - nnz = 0; - for (int k = 0; k < n; ++k) { - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - if (0.0 != data.get(k, i, j)) { - nnz++; - } - } - } - } - - /* print the dimensions so we can optionally use the output as - input later */ - out << n << " " << n << " " << n << " " - << nnz << std::endl; - - /* set the output format */ - out << std::setprecision(precision) - << std::scientific; - - /* print the data */ - for (int k = 0; k < n; ++k) { - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - if (0.0 != (v = data.get(k, i, j))) { - out << k << " " << i << " " << j << " " - << v << std::endl; - } - } - } - } - - /* all done */ - return out; - - } - -}; - -/*--------------------------------------------------------------------*/ - -/* Compensate for the lack of templated typedefs. 
- Usage: - sextended_karnaugh_map M; - ** NOT WORKING because of template template parameters ** -*/ -#if 0 -template -struct acronyms { - typedef coordinate COO; - typedef compressed_sparse_row CSR; - typedef compressed_sparse_column CSC; - typedef block_sparse_row BSR; -}; -#endif - -/*--------------------------------------------------------------------*/ - -END_NAMESPACE(format); - -/*--------------------------------------------------------------------*/ - -#endif /* _FORMAT_H_ */ - -/* - Local Variables: - mode: C++ - End: -*/ diff --git a/src/storage.h b/src/storage.h deleted file mode 100644 index c402482..0000000 --- a/src/storage.h +++ /dev/null @@ -1,42 +0,0 @@ - -#ifndef _STORAGE_H_ -#define _STORAGE_H_ - -#include "tensor.h" - -void* tensor_storage_malloc(tensor_t const *tensor); -tensor_storage_coordinate_t* tensor_storage_malloc_coordinate(tensor_t const *tensor); -tensor_storage_compressed_t* tensor_storage_malloc_compressed(tensor_t const *tensor); -tensor_storage_compressed_t* tensor_storage_malloc_compressed_slice(tensor_t const *tensor); -tensor_storage_extended_t* tensor_storage_malloc_ekmr(tensor_t const *tensor); -tensor_storage_extended_t* tensor_storage_malloc_zzekmr(tensor_t const *tensor); - -void tensor_storage_convert(tensor_t *destination, tensor_t *source); -void tensor_storage_convert_from_coordinate_to_compressed(tensor_t *destination, tensor_t *source); -void tensor_storage_convert_from_coordinate_to_compressed_slice(tensor_t *destination, tensor_t *source); -void tensor_storage_convert_from_coordinate_to_gundersen(tensor_t *destination, tensor_t *source); -void tensor_storage_convert_from_coordinate_to_ekmr(tensor_t *destination, tensor_t *source); -void tensor_storage_convert_from_coordinate_to_zzekmr(tensor_t *destination, tensor_t *source); -void tensor_storage_convert_from_compressed_to_coordinate(tensor_t *destination, tensor_t *source); - -int index_compare_ijk(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb); -int 
index_compare_jik(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb); -int index_compare_jki(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb); -int index_compare_kji(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb); -int index_compare_kij(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb); -int index_compare_ikj(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb); - -uint tensor_storage_index_encode(uint *indices, uint n, coordinate_tuple_t const *tuple, uint nnz, index_encoder_t encoder); -uint encoder_for_i(coordinate_tuple_t const *tuple); -uint encoder_for_j(coordinate_tuple_t const *tuple); -uint encoder_for_k(coordinate_tuple_t const *tuple); - -void tensor_storage_copy(void *destination, void const *source, uint nnz, index_copy_t copier); -void copier_for_i(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i); -void copier_for_j(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i); -void copier_for_k(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i); -void copier_for_values(tensor_t *destination, tensor_t const *source, uint i); - - -#endif - diff --git a/src/tensor_convert.cc b/src/tensor_convert.cc deleted file mode 100644 index bf7294d..0000000 --- a/src/tensor_convert.cc +++ /dev/null @@ -1,33 +0,0 @@ - -#include "compatible.h" -#include "error.h" -#include "memory.h" -#include "mmio.h" -#include "storage.h" -#include "tensor.h" -#include "utility.h" -#include -#include - -void -tensor_convert(tensor_t *destination, tensor_t *source) -{ - debug("tensor_convert(destination=0x%x, source=0x%x)\n", destination, source); - - compatible(destination, source); - tensor_storage_convert(destination, source); -} - -tensor_t* -tensor_convert(tensor_t *tensor, strategy::type_t strategy, orientation::type_t orientation) -{ - tensor_t *result; - - debug("tensor_convert(tensor=0x%x, strategy='%s', 
orientation='%s')\n", - tensor, strategy_to_string(strategy), orientation_to_string(orientation)); - - result = tensor_malloc(tensor->l, tensor->m, tensor->n, tensor->nnz, strategy, orientation); - tensor_convert(result, tensor); - - return result; -} diff --git a/src/tensor_permute.cc b/src/tensor_permute.cc deleted file mode 100644 index c41370b..0000000 --- a/src/tensor_permute.cc +++ /dev/null @@ -1,403 +0,0 @@ - -#include "error.h" -#include "memory.h" -#include "matrix.h" -#include "tensor.h" -#include "utility.h" -#include "vector.h" -#include -#include -#include - -typedef uint (*slice_distance_t)(tensor_t *tensor, uint i, uint j); -typedef void (*slice_permutation_t)(vector_t *vector, tensor_t *tensor, slice_distance_t distance); - -uint -slice_distance(tensor_t *tensor, uint s1, uint s2) -{ - uint i, j, c1, c2, k1, k2, n; - uint distance; - uint const *R, *C, *K; - tensor_storage_compressed_t const *storage; - - //superfluous("slice_distance(vector=0x%x, s1=%d, s2=%d)\n", tensor, s1, s2); - - distance = 0; - storage = STORAGE_COMPRESSED(tensor); - n = storage->rn; - R = storage->RO; - C = storage->CO; - K = storage->KO; - - for (i = R[s1], j = R[s2]; i < R[s1+1] && j < R[s2+1];) { - c1 = K[i] / n; k1 = K[i] % n; - c2 = K[j] / n; k2 = K[j] % n; - if (c1 != c2 || k1 != k2) { - distance++; - if (c1 < c2 || k1 < k2) { - i++; - } else if (c2 < c1 || k2 < k1) { - j++; - } - } else { - i++; j++; - } - } - for (; i < R[s1+1]; ++i) { - distance++; - } - for (; j < R[s2+1]; ++j) { - distance++; - } - - //DEBUG("slice_distance: distance(%d, %d)=%d\n", s1, s2, distance); - - return distance; -} - - -void -naive_minimum_permutation(vector_t *vector, tensor_t *tensor, slice_distance_t distance) -{ - uint i, j, k, p; - uint n; - uint best; - matrix_t *matrix; - double **D; - uint *V; - bool *seen; - uint const *R, *C, *K; - tensor_storage_compressed_t const *storage; - - debug("naive_minimum_permutation(vector=0x%x, tensor=0x%x, distance=0x%x)\n", - vector, tensor, 
distance); - - storage = STORAGE_COMPRESSED(tensor); - R = storage->RO; - C = storage->CO; - K = storage->KO; - - n = tensor->n; - matrix = matrix_malloc(n, n); - D = matrix->data; - V = vector->data; - - matrix_clear(matrix); - - for (i = 0; i < n; ++i) { - D[i][i] = n*n+1; - } - for (j = 0; j < n; ++j) { - best = n*n+1; - for (i = j+1; i < n; ++i) { - if (i != j) { - D[i][j] = (*distance)(tensor, i, j); - D[j][i] = D[i][j]; - if (best > D[i][j]) { - best = D[i][j]; - V[0] = i; - V[1] = j; - DEBUG("permutation: best(%d, %d)=%d\n", i, j, best); - } - } - } - } - - DEBUG("permutation: best=%d, V[0]=%d, V[1]=%d\n", best, V[0], V[1]); - //matrix_fwrite(stdout, matrix, format::coordinate); - - seen = MALLOC_N(bool, n); - for (i = 0; i < n; ++i) { - seen[i] = false; - } - - seen[V[0]] = true; - seen[V[1]] = true; - - for (j = 2; j < n; ++j) { - best = n*n+1; - k = 0; - p = V[j-1]; - for (i = 0; i < n; ++i) { - if (!seen[i] && i != p) { - DEBUG("permutation: looking-at(%d, %d)=%lf\n", i, j, D[i][j]); - if (best > D[p][i]) { - best = D[p][i]; - k = i; - DEBUG("permutation: best(%d, %d)=%d\n", i, j, best); - } - } - } - V[j] = k; - seen[V[j]] = true; - DEBUG("permutation: best=%d, V[%d]=%d, V[%d]=%d\n", best, j, V[j-1], j, V[j]); - DEBUG("permutation: seen=%d\n", k); - } - - safe_free(seen); - -#if 0 - vector_fwrite(stdout, vector) - vector_fwrite(stdout, mean); -#endif -} - -/* - * The following code is public domain. - * Algorithm by Torben Mogensen, implementation by N. Devillard. - * This code in public domain. 
- * - * Source: http://ndevilla.free.fr/median/median/src/ - */ -uint -non_destructive_median(double m[], uint n, uint skip) -{ - uint i, less, greater, equal; - double min, max, guess, maxltguess, mingtguess; - min = max = m[0] ; - for (i=1 ; imax) max=m[i]; - } - } - while (1) { - guess = (min+max)/2; - less = 0; greater = 0; equal = 0; - maxltguess = min ; - mingtguess = max ; - for (i=0; imaxltguess) maxltguess = m[i] ; } - else if (m[i]>guess) { - greater++; - if (m[i]greater) max = maxltguess ; - else min = mingtguess; - } - if (less >= (n+1)/2) return maxltguess; - else if (less+equal >= (n+1)/2) return guess; - else return mingtguess; -} - -void -naive_median_permutation(vector_t *vector, tensor_t *tensor, slice_distance_t distance) -{ - uint i, j, k; - uint n; - uint best, difference; - vector_t *mean; - matrix_t *matrix; - double **D; - uint *V, *M; - bool *seen; - - debug("naive_median_permutation(vector=0x%x, tensor=0x%x, distance=0x%x)\n", - vector, tensor, distance); - - n = tensor->n; - matrix = matrix_malloc(n, n); - mean = vector_malloc(n); - D = matrix->data; - V = vector->data; - M = mean->data; - - matrix_clear(matrix); - - for (j = 0; j < n; ++j) { - best = n*n+1; - for (i = 0; i < n; ++i) { - if (i != j) { - D[i][j] = (*distance)(tensor, i, j); - if (best > D[i][j]) { - best = D[i][j]; - V[0] = i; - V[1] = j; - DEBUG("permutation: best(%d, %d)=%d\n", i, j, best); - } - } - } - } - - for (i = 0; i < n; ++i) { - M[i] = non_destructive_median(D[i], n, i); - } - - DEBUG("permutation: best=%d, V[0]=%d, V[1]=%d\n", best, V[0], V[1]); -#if 0 - matrix_fwrite(stdout, matrix, format::coordinate); -#endif - - seen = MALLOC_N(bool, n); - for (i = 0; i < n; ++i) { - seen[i] = false; - } - - seen[V[0]] = true; - seen[V[1]] = true; - - for (j = 2; j < n; ++j) { - best = n*n+1; - k = 0; - for (i = 0; i < n; ++i) { - if (!seen[i] && i != j) { - difference = fabs(D[i][j]-M[i]); - DEBUG("permutation: looking-at(%d, %d)=%lf (difference=%lf)\n", i, j, D[i][j], 
difference); - if (best > difference) { - best = difference; - k = i; - DEBUG("permutation: best(%d, %d)=%d\n", i, j, best); - } - } - } - V[j] = k; - seen[V[j]] = true; - DEBUG("permutation: best=%d, V[%d]=%d, V[%d]=%d\n", best, j, V[j-1], j, V[j]); - DEBUG("permutation: seen=%d\n", k); - } - - safe_free(seen); - safe_free(matrix); - -#if 0 - vector_fwrite(stdout, mean); - vector_fwrite(stdout, vector); -#endif -} - - - -tensor_t* -tensor_apply_permutation(tensor_t *source, vector_t *vector) -{ - uint i, i1, i2, r0, r; - uint n, rn, nnz, offset; - uint const *R1, *K1, *V; - uint *R2, *K2; - tensor_storage_compressed_t *storage; - tensor_t *destination; - double *V1, *V2; - - superfluous("tensor_apply_permutation(vector=0x%x, vector=%0x%x)\n", source, vector); - - V = vector->data; - - n = source->n; - nnz = source->nnz; - storage = STORAGE_COMPRESSED(source); - rn = storage->rn; - R1 = storage->RO; - K1 = storage->KO; - V1 = source->values; - - destination = tensor_malloc(n, n, n, nnz, strategy::slice, orientation::frontal); - storage = STORAGE_COMPRESSED(destination); - R2 = storage->RO; - K2 = storage->KO; - V2 = destination->values; - storage->rn = rn; - - offset = 0; - R2[0] = 0; - - for (i = 0; i < n; ++i) { - r0 = R1[V[i]]; - r = R1[V[i]+1]; - i2 = offset; - DEBUG("> r0=R1[V[i=%d] =%d]=%d\n", i, V[i], r0); - DEBUG("> r =R1[V[i=%d]+1=%d]=%d\n", i, V[i]+1, r); - DEBUG("> i2=%d\n", i2); - for (i1 = r0; i1 < r && i2 < nnz; ++i1, ++i2) { - K2[i2] = K1[i1]; - V2[i2] = V1[i1]; - DEBUG("K2[i2=%d]=%d; K1[i1=%d]=%d\n", i2, K2[i2], i1, K1[i1]); - DEBUG("V2[i2=%d]=%lf; V1[i1=%d]=%lf\n", i2, V2[i2], i1, V1[i1]); - } - offset += r - r0; - R2[i+1] = offset; - DEBUG("< R2[i+1=%d]=%d+%d-%d=%d\n", i+1, offset, r, r0, R2[i+1]); - } - - R2[i] = nnz; - -#if 0 - tensor_fwrite(stdout, destination); -#endif - - return destination; -} - - -void -permutation_supported(tensor_t *tensor) -{ - debug("permutation_supported(tensor=0x%x)\n", tensor); - - if (tensor->strategy != 
strategy::coordinate) { - die("permutation_supported: the tensor strategy '%s' is not supported.\n", - strategy_to_string(tensor->strategy)); - } -} - - -tensor_t* -tensor_permute(tensor_t *tensor, permutation_heuristic::type_t heuristic) -{ - vector_t *vector; - tensor_t *frontal, *permuted, *coordinate; - slice_distance_t distance; - slice_permutation_t permutation; - - debug("tensor_permute(tensor=0x%x, heuristic='%s')\n", - tensor, permutation_heuristic_to_string(heuristic)); - - permutation_supported(tensor); - - distance = &slice_distance; - vector = vector_malloc(tensor->n); - frontal = tensor_convert(tensor, strategy::slice, orientation::frontal); - -#if 0 - message("compressed frontal slice:\n"); - tensor_fwrite(stdout, frontal); -#endif - - switch (heuristic) { - case permutation_heuristic::naive_minimum: - permutation = &naive_minimum_permutation; - break; - case permutation_heuristic::naive_median: - permutation = &naive_median_permutation; - break; - default: - die("Heuristic '%d' is not supported.\n", heuristic); - break; - } - - (*permutation)(vector, frontal, distance); - permuted = tensor_apply_permutation(frontal, vector); - tensor_free(frontal); - vector_free(vector); - -#if 0 - message("compressed frontal slice (permuted):\n"); - tensor_fwrite(stdout, permuted); -#endif - - coordinate = tensor_convert(permuted, strategy::coordinate); - tensor_free(permuted); - -#if 0 - message("coordinate:\n"); - tensor_fwrite(stdout, coordinate); -#endif - - return coordinate; -} - diff --git a/src/tensor_storage_compressed.cc b/src/tensor_storage_compressed.cc deleted file mode 100644 index a26e1d1..0000000 --- a/src/tensor_storage_compressed.cc +++ /dev/null @@ -1,123 +0,0 @@ - -#include "error.h" -#include "memory.h" -#include "mmio.h" -#include "tensor.h" -#include "storage.h" -#include "utility.h" -#include -#include - -void -copier_for_row(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->CO[i] 
= source->tuples[i].j; - destination->KO[i] = source->tuples[i].k; -} - -void -copier_for_column(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->CO[i] = source->tuples[i].i; - destination->KO[i] = source->tuples[i].k; -} - -void -copier_for_tube(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->CO[i] = source->tuples[i].k; - destination->KO[i] = source->tuples[i].j; -} - -void -tensor_storage_convert_from_coordinate_to_compressed(tensor_t *destination, tensor_t *source) -{ - int n, nnz; - tensor_storage_base_t *base; - tensor_storage_compressed_t *d; - tensor_storage_coordinate_t *s; - coordinate_tuple_t *tuples; - double *values; - - s = STORAGE_COORIDINATE(source); - d = STORAGE_COMPRESSED(destination); - - debug("tensor_storage_convert_from_coordinate_to_compressed(destination=0x%x, source=0x%x)\n", destination, source); - - base = STORAGE_BASE(destination); - nnz = source->nnz; - n = source->n; - values = source->values; - tuples = s->tuples; - - qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare); - d->rn = tensor_storage_index_encode(d->RO, n, tuples, nnz, base->callbacks->index_r_encoder); - tensor_storage_copy(d, s, nnz, base->callbacks->index_copy); - tensor_storage_copy(destination, source, nnz, (index_copy_t) &copier_for_values); -} - -tensor_storage_compressed_t* -tensor_storage_malloc_compressed(tensor_t const *tensor) -{ - tensor_storage_base_t *base; - tensor_storage_compressed_t *storage; - conversion_callbacks_t *callbacks; - - superfluous("tensor_storage_malloc_compressed(tensor=0x%x)\n", tensor); - - storage = MALLOC(tensor_storage_compressed_t); - storage->rn = 0; - storage->cn = tensor->nnz; - storage->kn = tensor->nnz; - storage->RO = NULL; - storage->CO = MALLOC_N(uint, storage->cn); - storage->TO = MALLOC_N(uint, storage->cn); - storage->KO = MALLOC_N(uint, storage->kn); - - 
debug("tensor_storage_malloc_compressed: rn=%d, kn=%d\n", storage->rn, storage->kn); - - callbacks = MALLOC(conversion_callbacks_t); - callbacks->index_compare = NULL; - callbacks->index_r_encoder = NULL; - callbacks->index_copy = NULL; - - switch (tensor->orientation) { - case orientation::row: - storage->rn = tensor->m; - callbacks->index_compare = (index_compare_t) &index_compare_ikj; - callbacks->index_r_encoder = &encoder_for_i; - callbacks->index_copy = (index_copy_t) &copier_for_row; - break; - case orientation::column: - storage->rn = tensor->n; - callbacks->index_compare = (index_compare_t) &index_compare_jki; - callbacks->index_r_encoder = &encoder_for_j; - callbacks->index_copy = (index_copy_t) &copier_for_column; - break; - case orientation::tube: - storage->rn = tensor->l; - callbacks->index_compare = (index_compare_t) &index_compare_ijk; - callbacks->index_r_encoder = &encoder_for_i; - callbacks->index_copy = (index_copy_t) &copier_for_tube; - break; - default: - die("tensor_storage_malloc_compressed: " - "unknown or unsupported orientation %d.\n", - tensor->orientation); - break; - } - - storage->rn += 1; - storage->RO = MALLOC_N(uint, storage->rn); - base = (tensor_storage_base_t*) storage; - base->callbacks = callbacks; - - superfluous("tensor_storage_malloc_compressed: callbacks=0x%x\n", callbacks); - superfluous("tensor_storage_malloc_compressed: storage->CO=0x%x\n", storage->CO); - superfluous("tensor_storage_malloc_compressed: storage->KO=0x%x\n", storage->KO); - superfluous("tensor_storage_malloc_compressed: storage->size (of RO)=%d\n", storage->rn); - superfluous("tensor_storage_malloc_compressed: storage->RO=0x%x\n", storage->RO); - superfluous("tensor_storage_malloc_compressed: storage=0x%x\n", storage); - - return storage; -} diff --git a/src/tensor_storage_compressed_slice.cc b/src/tensor_storage_compressed_slice.cc deleted file mode 100644 index dc2e83f..0000000 --- a/src/tensor_storage_compressed_slice.cc +++ /dev/null @@ -1,122 +0,0 @@ 
- -#include "error.h" -#include "memory.h" -#include "mmio.h" -#include "tensor.h" -#include "storage.h" -#include "utility.h" -#include -#include - -static uint g_n; - -void -copier_for_slice_lateral(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->KO[i] = source->tuples[i].i * g_n + source->tuples[i].k; -} - -void -copier_for_slice_horizontal(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->KO[i] = source->tuples[i].j * g_n + source->tuples[i].k; -} - -void -copier_for_slice_frontal(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->KO[i] = source->tuples[i].i * g_n + source->tuples[i].j; - -#if 0 - debug("copier_for_slice_frontal: KO[i=%u]=(i=%u) * (n=%u) + (j=%u)=%u\n", - i, source->tuples[i].i, g_n, source->tuples[i].j, destination->KO[i]); -#endif -} - -void -tensor_storage_convert_from_coordinate_to_compressed_slice(tensor_t *destination, tensor_t *source) -{ - uint n, nnz; - tensor_storage_base_t *base; - tensor_storage_compressed_t *d; - tensor_storage_coordinate_t *s; - coordinate_tuple_t *tuples; - double *values; - - s = STORAGE_COORIDINATE(source); - d = STORAGE_COMPRESSED(destination); - - debug("tensor_storage_convert_from_coordinate_to_compressed_slice(destination=0x%x, source=0x%x)\n", destination, source); - - base = STORAGE_BASE(destination); - nnz = source->nnz; - n = source->n; - values = source->values; - tuples = s->tuples; - g_n = source->n; - - qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare); - d->rn = tensor_storage_index_encode(d->RO, n, tuples, nnz, base->callbacks->index_r_encoder); - tensor_storage_copy(d, s, nnz, base->callbacks->index_copy); - tensor_storage_copy(destination, source, nnz, (index_copy_t) &copier_for_values); -} - -tensor_storage_compressed_t* -tensor_storage_malloc_compressed_slice(tensor_t const *tensor) 
-{ - tensor_storage_base_t *base; - tensor_storage_compressed_t *storage; - conversion_callbacks_t *callbacks; - - superfluous("tensor_storage_malloc_compressed_slice(tensor=0x%x)\n", tensor); - - storage = MALLOC(tensor_storage_compressed_t); - storage->rn = tensor->n * tensor->n + 1; - storage->kn = tensor->nnz; - storage->RO = MALLOC_N(uint, storage->rn); - storage->CO = NULL; - storage->TO = NULL; - storage->KO = MALLOC_N(uint, storage->kn); - - debug("tensor_storage_malloc_compressed_slice: rn=%d, kn=%d\n", storage->rn, storage->kn); - - callbacks = MALLOC(conversion_callbacks_t); - callbacks->index_compare = NULL; - callbacks->index_r_encoder = NULL; - callbacks->index_copy = NULL; - - switch (tensor->orientation) { - case orientation::lateral: - callbacks->index_compare = (index_compare_t) &index_compare_jik; - callbacks->index_r_encoder = &encoder_for_j; - callbacks->index_copy = (index_copy_t) &copier_for_slice_lateral; - break; - case orientation::horizontal: - callbacks->index_compare = (index_compare_t) &index_compare_ijk; - callbacks->index_r_encoder = &encoder_for_i; - callbacks->index_copy = (index_copy_t) &copier_for_slice_horizontal; - break; - case orientation::frontal: - callbacks->index_compare = (index_compare_t) &index_compare_kij; - callbacks->index_r_encoder = &encoder_for_k; - callbacks->index_copy = (index_copy_t) &copier_for_slice_frontal; - break; - default: - die("tensor_storage_malloc_compressed_slice: " - "unknown or unsupported orientation %d.\n", - tensor->orientation); - break; - } - - base = (tensor_storage_base_t*) storage; - base->callbacks = callbacks; - - superfluous("tensor_storage_malloc_compressed_slice: callbacks=0x%x\n", callbacks); - superfluous("tensor_storage_malloc_compressed_slice: storage->RO=0x%x\n", storage->RO); - superfluous("tensor_storage_malloc_compressed_slice: storage->CO=0x%x\n", storage->CO); - superfluous("tensor_storage_malloc_compressed_slice: storage->TO=0x%x\n", storage->TO); - 
superfluous("tensor_storage_malloc_compressed_slice: storage->KO=0x%x\n", storage->KO); - superfluous("tensor_storage_malloc_compressed_slice: storage=0x%x\n", storage); - - return storage; -} diff --git a/src/tensor_storage_convert.cc b/src/tensor_storage_convert.cc deleted file mode 100644 index 91202c6..0000000 --- a/src/tensor_storage_convert.cc +++ /dev/null @@ -1,230 +0,0 @@ - -#include "error.h" -#include "memory.h" -#include "mmio.h" -#include "storage.h" -#include "tensor.h" -#include "utility.h" -#include -#include - -void -convert_from_compressed_to_coordinate(tensor_t *destination, tensor_t *source) -{ - debug("convert_from_compressed_to_coordinate(destination=0x%x, source=0x%x)\n", destination, source); - - tensor_storage_convert_from_compressed_to_coordinate(destination, source); - -#if 0 - switch (destination->orientation) { - case orientation::tube: - tensor_storage_convert_from_compressed_to_coordinate(destination, source); - break; - case orientation::row: - case orientation::column: - tensor_storage_convert_from_compressed_to_coordinate(destination, source); - break; - case orientation::lateral: - case orientation::horizontal: - case orientation::frontal: - tensor_storage_convert_from_compressed_slice_to_coordinate(destination, source); - break; - default: - die("Conversion to orientation '%s' (%d) is not currently supported.\n", - orientation_to_string(destination->orientation), destination->orientation); - break; - } -#endif -} - -void -convert_to_coordinate(tensor_t *destination, tensor_t *source) -{ - debug("convert_to_coordinate(destination=0x%x, source=0x%x)\n", destination, source); - - switch (source->strategy) { - case strategy::compressed: - case strategy::slice: - convert_from_compressed_to_coordinate(destination, source); - break; - default: - die("Conversion from '%s' strategy to '%s' is not currently supported.\n", - strategy_to_string(source->strategy), - strategy_to_string(destination->strategy)); - break; - } -} - -void 
-convert_from_coordinate_to_compressed(tensor_t *destination, tensor_t *source) -{ - debug("convert_from_coordinate_to_compressed(destination=0x%x, source=0x%x)\n", destination, source); - - switch (destination->orientation) { - case orientation::row: - case orientation::column: - case orientation::tube: - tensor_storage_convert_from_coordinate_to_compressed(destination, source); - break; - case orientation::lateral: - case orientation::horizontal: - case orientation::frontal: - tensor_storage_convert_from_coordinate_to_compressed_slice(destination, source); - break; - default: - die("Conversion to orientation '%s' is not currently supported.\n", - orientation_to_string(destination->orientation)); - break; - } -} - -void -convert_from_coordinate_to_gundersen(tensor_t *destination, tensor_t *source) -{ - debug("convert_from_coordinate_to_gundersen(destination=0x%x, source=0x%x)\n", destination, source); - - switch (destination->orientation) { - case orientation::tube: - tensor_storage_convert_from_coordinate_to_gundersen(destination, source); - break; - default: - die("Conversion to orientation '%s' is not currently supported.\n", - orientation_to_string(destination->orientation)); - break; - } -} - -void -convert_from_coordinate_to_ekmr(tensor_t *destination, tensor_t *source) -{ - debug("convert_from_coordinate_to_ekmr(destination=0x%x, source=0x%x)\n", destination, source); - - switch (destination->orientation) { - case orientation::row: - case orientation::column: - case orientation::tube: - tensor_storage_convert_from_coordinate_to_ekmr(destination, source); - break; - default: - die("Conversion to orientation '%s' is not currently supported.\n", - orientation_to_string(destination->orientation)); - break; - } -} - -void -convert_from_coordinate_to_zzekmr(tensor_t *destination, tensor_t *source) -{ - debug("convert_from_coordinate_to_zzekmr(destination=0x%x, source=0x%x)\n", destination, source); - - switch (destination->orientation) { - case orientation::row: 
- case orientation::column: - case orientation::tube: - tensor_storage_convert_from_coordinate_to_zzekmr(destination, source); - break; - default: - die("Conversion to orientation '%s' is not currently supported.\n", - orientation_to_string(destination->orientation)); - break; - } -} - -void -convert_to_compressed(tensor_t *destination, tensor_t *source) -{ - debug("convert_to_compressed(destination=0x%x, source=0x%x)\n", destination, source); - - switch (source->strategy) { - case strategy::coordinate: - convert_from_coordinate_to_compressed(destination, source); - break; - default: - die("Conversion from '%s' strategy to '%s' is not currently supported.\n", - strategy_to_string(source->strategy), - strategy_to_string(destination->strategy)); - break; - } -} - -void -convert_to_gundersen(tensor_t *destination, tensor_t *source) -{ - debug("convert_to_gundersen(destination=0x%x, source=0x%x)\n", destination, source); - - switch (source->strategy) { - case strategy::coordinate: - convert_from_coordinate_to_gundersen(destination, source); - break; - default: - die("Conversion from '%s' strategy to '%s' is not currently supported.\n", - strategy_to_string(source->strategy), - strategy_to_string(destination->strategy)); - break; - } -} - -void -convert_to_ekmr(tensor_t *destination, tensor_t *source) -{ - debug("convert_to_ekmr(destination=0x%x, source=0x%x)\n", destination, source); - - switch (source->strategy) { - case strategy::coordinate: - convert_from_coordinate_to_ekmr(destination, source); - break; - default: - die("Conversion from '%s' strategy to '%s' is not currently supported.\n", - strategy_to_string(source->strategy), - strategy_to_string(destination->strategy)); - break; - } -} - -void -convert_to_zzekmr(tensor_t *destination, tensor_t *source) -{ - debug("convert_to_zzekmr(destination=0x%x, source=0x%x)\n", destination, source); - - switch (source->strategy) { - case strategy::coordinate: - convert_from_coordinate_to_zzekmr(destination, source); - 
break; - default: - die("Conversion from '%s' strategy to '%s' is not currently supported.\n", - strategy_to_string(source->strategy), - strategy_to_string(destination->strategy)); - break; - } -} - -void -tensor_storage_convert(tensor_t *destination, tensor_t *source) -{ - debug("tensor_storage_convert(destination=0x%x, source=0x%x)\n", destination, source); - - switch (destination->strategy) { - case strategy::coordinate: - convert_to_coordinate(destination, source); - break; - case strategy::compressed: - case strategy::slice: - convert_to_compressed(destination, source); - break; - case strategy::gundersen: - convert_to_gundersen(destination, source); - break; - case strategy::ekmr: - convert_to_ekmr(destination, source); - break; - case strategy::zzekmr: - convert_to_zzekmr(destination, source); - break; - default: - die("Conversion from '%s' strategy to '%s' is not currently supported.\n", - strategy_to_string(source->strategy), - strategy_to_string(destination->strategy)); - break; - } -} - diff --git a/src/tensor_storage_coordinate.cc b/src/tensor_storage_coordinate.cc deleted file mode 100644 index c82e955..0000000 --- a/src/tensor_storage_coordinate.cc +++ /dev/null @@ -1,134 +0,0 @@ - -#include "error.h" -#include "memory.h" -#include "mmio.h" -#include "tensor.h" -#include "storage.h" -#include "utility.h" -#include -#include - -void -tensor_storage_convert_from_compressed_tube_to_coordinate(tensor_t *destination, tensor_t *source) -{ - uint i, t, r0, r; - uint n, rn, nnz; - tensor_storage_coordinate_t *d; - tensor_storage_compressed_t *s; - coordinate_tuple_t *T; - double *V; - uint *R, *C, *K; - - s = STORAGE_COMPRESSED(source); - d = STORAGE_COORIDINATE(destination); - - debug("tensor_storage_convert_from_compressed_tube_to_coordinate(destination=0x%x, source=0x%x)\n", destination, source); - - nnz = source->nnz; - T = d->tuples; - - n = source->n; - rn = s->rn; - R = s->RO; - C = s->CO; - K = s->KO; - V = source->values; - - for (r = 1, t = 0; r < 
rn; ++r) { - r0 = r-1; - for (i = R[r0]; i < R[r]; ++i, ++t) { - T[t].i = r0; - T[t].j = K[i]; - T[t].k = C[i]; - T[t].index = i; - - } - } - - for (i = 0; i < nnz; ++i) { - destination->values[i] = source->values[i]; - } -} - -void -tensor_storage_convert_from_compressed_slice_to_coordinate(tensor_t *destination, tensor_t *source) -{ - uint i, t, r0, r; - uint n, rn, nnz; - tensor_storage_coordinate_t *d; - tensor_storage_compressed_t *s; - coordinate_tuple_t *T; - double *V; - uint *R, *C, *K; - - s = STORAGE_COMPRESSED(source); - d = STORAGE_COORIDINATE(destination); - - debug("tensor_storage_convert_from_compressed_slice_to_coordinate(destination=0x%x, source=0x%x)\n", destination, source); - - nnz = source->nnz; - T = d->tuples; - - n = source->n; - rn = s->rn; - R = s->RO; - C = s->CO; - K = s->KO; - V = source->values; - - for (r = 1, t = 0; r < rn; ++r) { - r0 = r-1; - DEBUG("R[r0=%u]=%u, R[r=%u]=%u\n", r0, R[r0], r, R[r]); - for (i = R[r0]; i < R[r]; ++i, ++t) { - DEBUG("K[i=%u]=%u\n", i, K[i]); - T[t].i = K[i] / n; - T[t].j = K[i] % n; - T[t].k = r0 % n; - T[t].index = i; - DEBUG("i=%u, j=%u, k=%u, index=%u\n", T[t].i, T[t].j, T[t].k, T[t].index); - } - } - - for (i = 0; i < nnz; ++i) { - destination->values[i] = source->values[i]; - } -} - -void -tensor_storage_convert_from_compressed_to_coordinate(tensor_t *destination, tensor_t *source) -{ - debug("tensor_storage_convert_from_compressed_to_coordinate(destination=0x%x, source=0x%x)\n", destination, source); - - switch (source->strategy) { - case strategy::compressed: - tensor_storage_convert_from_compressed_tube_to_coordinate(destination, source); - break; - case strategy::slice: - tensor_storage_convert_from_compressed_slice_to_coordinate(destination, source); - break; - default: - die("tensor_storage_convert_from_compressed_to_coordinate: " - "unknown or unsupported strategy %d.\n", - source->strategy); - break; - } -} - -tensor_storage_coordinate_t* -tensor_storage_malloc_coordinate(tensor_t const 
*tensor) -{ - tensor_storage_base_t *base; - tensor_storage_coordinate_t *storage; - - superfluous("tensor_storage_malloc_coordinate(tensor=0x%x, nnz=%d)\n", tensor, tensor->nnz); - - storage = MALLOC(tensor_storage_coordinate_t); - storage->tuples = MALLOC_N(coordinate_tuple_t, tensor->nnz); - base = (tensor_storage_base_t*) storage; - base->callbacks = NULL; - - superfluous("tensor_storage_malloc_coordinate: storage->tuples=0x%x\n", storage->tuples); - superfluous("tensor_storage_malloc_coordinate: storage=0x%x\n", storage); - - return storage; -} diff --git a/src/tensor_storage_ekmr.cc b/src/tensor_storage_ekmr.cc deleted file mode 100644 index 3b64028..0000000 --- a/src/tensor_storage_ekmr.cc +++ /dev/null @@ -1,100 +0,0 @@ - -#include "error.h" -#include "memory.h" -#include "mmio.h" -#include "tensor.h" -#include "storage.h" -#include "utility.h" -#include -#include - -void -index_copy_for_ekmr_row(void *destination, void const *source, uint nnz) -{ - uint i, n; - tensor_storage_coordinate_t const *s; - tensor_storage_extended_t *d; - - s = (tensor_storage_coordinate_t const*) source; - d = (tensor_storage_extended_t*) destination; - n = d->rn - 1; - - debug("index_copy_for_ekmr_row(destination=0x%x, source=0x%x, nnz=%d)\n", d, s, nnz); - - for (i = 0; i < nnz; ++i) { - d->CK[i] = s->tuples[i].j * n + s->tuples[i].k; - } -} - -void -tensor_storage_convert_from_coordinate_to_ekmr(tensor_t *destination, tensor_t *source) -{ - int n, nnz; - tensor_storage_base_t *base; - tensor_storage_extended_t *d; - tensor_storage_coordinate_t *s; - coordinate_tuple_t *tuples; - double *values; - - s = STORAGE_COORIDINATE(source); - d = STORAGE_EXTENDED(destination); - - debug("tensor_storage_convert_from_coordinate_to_ekmr(destination=0x%x, source=0x%x)\n", d, s); - - base = STORAGE_BASE(destination); - nnz = source->nnz; - n = source->n; - values = source->values; - tuples = s->tuples; - - qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare); - 
d->rn = tensor_storage_index_encode(d->RO, n, tuples, nnz, base->callbacks->index_r_encoder); - tensor_storage_copy(d, s, nnz, base->callbacks->index_copy); - tensor_storage_copy(destination, source, nnz, (index_copy_t) &copier_for_values); -} - -tensor_storage_extended_t* -tensor_storage_malloc_ekmr(tensor_t const *tensor) -{ - tensor_storage_base_t *base; - tensor_storage_extended_t *storage; - conversion_callbacks_t *callbacks; - - superfluous("tensor_storage_malloc_ekmr(tensor=0x%x)\n", tensor); - - storage = MALLOC(tensor_storage_extended_t); - storage->rn = 0; - storage->ckn = tensor->nnz; - storage->RO = NULL; - storage->CK = MALLOC_N(uint, storage->ckn); - - callbacks = MALLOC(conversion_callbacks_t); - callbacks->index_compare = NULL; - callbacks->index_r_encoder = NULL; - callbacks->index_copy = NULL; - - switch (tensor->orientation) { - case orientation::row: - storage->rn = tensor->n; - callbacks->index_compare = (index_compare_t) &index_compare_ijk; - callbacks->index_r_encoder = &encoder_for_i; - callbacks->index_copy = &index_copy_for_ekmr_row; - break; - default: - die("Tensor orientation '%s' not yet supported.\n", orientation_to_string(tensor->orientation)); - break; - } - - storage->rn += 1; - storage->RO = MALLOC_N(uint, storage->rn); - base = (tensor_storage_base_t*) storage; - base->callbacks = callbacks; - - superfluous("tensor_storage_malloc_ekmr: callbacks=0x%x\n", callbacks); - superfluous("tensor_storage_malloc_ekmr: storage->CK=0x%x\n", storage->CK); - superfluous("tensor_storage_malloc_ekmr: storage->size (of R)=%d\n", storage->rn); - superfluous("tensor_storage_malloc_ekmr: storage->RO=0x%x\n", storage->RO); - superfluous("tensor_storage_malloc_ekmr: storage=0x%x\n", storage); - - return storage; -} diff --git a/src/tensor_storage_gundersen.cc b/src/tensor_storage_gundersen.cc deleted file mode 100644 index c33f97c..0000000 --- a/src/tensor_storage_gundersen.cc +++ /dev/null @@ -1,136 +0,0 @@ - -#include "error.h" -#include "memory.h" 
-#include "mmio.h" -#include "tensor.h" -#include "storage.h" -#include "utility.h" -#include -#include - -void -tensor_storage_convert_from_coordinate_to_gundersen(tensor_t *destination, tensor_t *source) -{ - uint nnz; - uint rn, cn, index, current, prev_ri, prev_ci; - tensor_storage_base_t *base; - tensor_storage_compressed_t *d; - tensor_storage_coordinate_t *s; - coordinate_tuple_t *tuples; - double *values; - uint *R, *C, *K; - index_encoder_t r_encoder, c_encoder; - - debug("tensor_storage_convert_from_coordinate_to_gundersen(destination=0x%x, source=0x%x)\n", destination, source); - - nnz = source->nnz; - values = source->values; - - base = STORAGE_BASE(destination); - r_encoder = base->callbacks->index_r_encoder; - c_encoder = base->callbacks->index_c_encoder; - - s = STORAGE_COORIDINATE(source); - d = STORAGE_COMPRESSED(destination); - tuples = s->tuples; - R = d->RO; - C = d->CO; - K = d->KO; - - qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare); - tensor_storage_copy(d, s, nnz, base->callbacks->index_copy); - tensor_storage_copy(destination, source, nnz, (index_copy_t) &copier_for_values); - - rn = 0; - cn = 0; - prev_ri = r_encoder(&tuples[0]); - prev_ci = c_encoder(&tuples[0]); - - C[cn++] = 0; - R[rn++] = 1; - - for (current = 0; current < nnz; ++current) { - DEBUG("i=%u, j=%u, k=%u, index=%u\n", - tuples[current].i, tuples[current].j, - tuples[current].k, tuples[current].index); - index = base->callbacks->index_c_encoder(&tuples[current]); - if (prev_ci != index) { - DEBUG("C[size=%u]=%u\n", cn, current); - C[cn++] = current; - prev_ci = index; - } - index = base->callbacks->index_r_encoder(&tuples[current]); - if (prev_ri != index) { - DEBUG("R[size=%u]=%u\n", rn, current); - R[rn++] = cn; - prev_ri = index; - } - } - - DEBUG("C[size=%u]=%u\n", cn, nnz); - DEBUG("R[size=%u]=%u\n", rn, cn); - - C[cn++] = nnz; - R[rn++] = cn; - - DEBUG("rn=%u\n", rn); - DEBUG("cn=%u\n", cn); - - d->rn = rn; - d->cn = cn; -} - 
-tensor_storage_compressed_t* -tensor_storage_compressed_gundersen(tensor_t const *tensor) -{ - tensor_storage_base_t *base; - tensor_storage_compressed_t *storage; - conversion_callbacks_t *callbacks; - - superfluous("tensor_storage_compressed_gundersen(tensor=0x%x)\n", tensor); - - storage = MALLOC(tensor_storage_compressed_t); - storage->rn = tensor->n + 1; - storage->cn = tensor->n * tensor->n + 1; - storage->tn = tensor->n * tensor->n + 1; - storage->kn = tensor->nnz; - storage->RO = MALLOC_N(uint, storage->rn); - storage->CO = MALLOC_N(uint, storage->cn); - storage->TO = MALLOC_N(uint, storage->tn); - storage->KO = MALLOC_N(uint, storage->kn); - - debug("tensor_storage_compressed_gundersen: rn=%d, cn=%d, tn=%d, kn=%d\n", - storage->rn, storage->cn, storage->tn, storage->kn); - - callbacks = MALLOC(conversion_callbacks_t); - callbacks->index_compare = NULL; - callbacks->index_r_encoder = NULL; - callbacks->index_c_encoder = NULL; - callbacks->index_copy = NULL; - - switch (tensor->orientation) { - case orientation::tube: - callbacks->index_compare = (index_compare_t) &index_compare_jik; - callbacks->index_r_encoder = &encoder_for_j; - callbacks->index_c_encoder = &encoder_for_i; - callbacks->index_copy = (index_copy_t) &copier_for_k; - break; - default: - die("tensor_storage_compressed_gundersen: " - "unknown or unsupported orientation %d.\n", - tensor->orientation); - break; - } - - base = (tensor_storage_base_t*) storage; - base->callbacks = callbacks; - - superfluous("tensor_storage_compressed_gundersen: callbacks=0x%x\n", callbacks); - superfluous("tensor_storage_malloc_compressed: storage->CO=0x%x\n", storage->CO); - superfluous("tensor_storage_malloc_compressed: storage->KO=0x%x\n", storage->KO); - superfluous("tensor_storage_compressed_gundersen: storage->size (of RO)=%d\n", storage->rn); - superfluous("tensor_storage_compressed_gundersen: storage->RO=0x%x\n", storage->RO); - superfluous("tensor_storage_compressed_gundersen: storage=0x%x\n", storage); - 
- return storage; -} diff --git a/src/tensor_storage_malloc.cc b/src/tensor_storage_malloc.cc deleted file mode 100644 index 9abd3f7..0000000 --- a/src/tensor_storage_malloc.cc +++ /dev/null @@ -1,44 +0,0 @@ - -#include "error.h" -#include "memory.h" -#include "mmio.h" -#include "storage.h" -#include "tensor.h" -#include "utility.h" -#include -#include - -void* -tensor_storage_malloc(tensor_t const *tensor) -{ - void *storage; - - superfluous("tensor_storage_malloc(tensor=0x%x [strategy='%s'])\n", - tensor, strategy_to_string(tensor->strategy)); - - storage = NULL; - - switch (tensor->strategy) { - case strategy::coordinate: - storage = tensor_storage_malloc_coordinate(tensor); - break; - case strategy::compressed: - storage = tensor_storage_malloc_compressed(tensor); - break; - case strategy::slice: - storage = tensor_storage_malloc_compressed_slice(tensor); - break; - case strategy::ekmr: - storage = tensor_storage_malloc_ekmr(tensor); - break; - case strategy::zzekmr: - storage = tensor_storage_malloc_zzekmr(tensor); - break; - default: - die("Tensor storage strategy '%d' is not supported.\n", tensor->strategy); - } - - superfluous("tensor_storage_malloc: storage=0x%x\n", storage); - - return storage; -} diff --git a/src/tensor_storage_matrix_slice.cc b/src/tensor_storage_matrix_slice.cc deleted file mode 100644 index dde6577..0000000 --- a/src/tensor_storage_matrix_slice.cc +++ /dev/null @@ -1,117 +0,0 @@ - -#include "error.h" -#include "memory.h" -#include "mmio.h" -#include "tensor.h" -#include "storage.h" -#include "utility.h" -#include -#include - -static uint g_n; - -static void -copier_for_slice_lateral(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->KO[i] = source->tuples[i].i * g_n + source->tuples[i].k; -} - -static void -copier_for_slice_horizontal(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->KO[i] = source->tuples[i].j * 
g_n + source->tuples[i].k; -} - -static void -copier_for_slice_frontal(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->KO[i] = source->tuples[i].i * g_n + source->tuples[i].j; -} - -void -tensor_storage_convert_from_coordinate_to_matrix_slice(tensor_t *destination, tensor_t *source) -{ - uint n, nnz; - tensor_storage_base_t *base; - tensor_storage_compressed_t *d; - tensor_storage_coordinate_t *s; - coordinate_tuple_t *tuples; - double *values; - - s = STORAGE_COORIDINATE(source); - d = STORAGE_COMPRESSED(destination); - - debug("tensor_storage_convert_from_coordinate_to_matrix_slice(destination=0x%x, source=0x%x)\n", destination, source); - - base = STORAGE_BASE(destination); - nnz = source->nnz; - n = source->n; - values = source->values; - tuples = s->tuples; - g_n = source->n; - - qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare); - d->rn = tensor_storage_index_encode(d->RO, n, tuples, nnz, base->callbacks->index_r_encoder); - tensor_storage_copy(d, s, nnz, base->callbacks->index_copy); - tensor_storage_copy(destination, source, nnz, (index_copy_t) &copier_for_values); -} - -tensor_storage_compressed_t* -tensor_storage_malloc_matrix_slice(tensor_t const *tensor) -{ - tensor_storage_base_t *base; - tensor_storage_compressed_t *storage; - conversion_callbacks_t *callbacks; - - superfluous("tensor_storage_malloc_matrix_slice(tensor=0x%x)\n", tensor); - - storage = MALLOC(tensor_storage_compressed_t); - storage->rn = tensor->n * tensor->n + 1; - storage->kn = tensor->nnz; - storage->RO = MALLOC_N(uint, storage->rn); - storage->CO = NULL; - storage->TO = NULL; - storage->KO = MALLOC_N(uint, storage->kn); - - debug("tensor_storage_malloc_matrix_slice: rn=%d, kn=%d\n", storage->rn, storage->kn); - - callbacks = MALLOC(conversion_callbacks_t); - callbacks->index_compare = NULL; - callbacks->index_r_encoder = NULL; - callbacks->index_copy = NULL; - - switch (tensor->orientation) 
{ - case orientation::lateral: - callbacks->index_compare = (index_compare_t) &index_compare_jik; - callbacks->index_r_encoder = &encoder_for_j; - callbacks->index_copy = (index_copy_t) &copier_for_slice_lateral; - break; - case orientation::horizontal: - callbacks->index_compare = (index_compare_t) &index_compare_ijk; - callbacks->index_r_encoder = &encoder_for_i; - callbacks->index_copy = (index_copy_t) &copier_for_slice_horizontal; - break; - case orientation::frontal: - callbacks->index_compare = (index_compare_t) &index_compare_kij; - callbacks->index_r_encoder = &encoder_for_k; - callbacks->index_copy = (index_copy_t) &copier_for_slice_frontal; - break; - default: - die("tensor_storage_malloc_matrix_slice: " - "unknown or unsupported orientation %d.\n", - tensor->orientation); - break; - } - - base = (tensor_storage_base_t*) storage; - base->callbacks = callbacks; - - superfluous("tensor_storage_malloc_matrix_slice: callbacks=0x%x\n", callbacks); - superfluous("tensor_storage_malloc_matrix_slice: storage->RO=0x%x\n", storage->RO); - superfluous("tensor_storage_malloc_matrix_slice: storage->CO=0x%x\n", storage->CO); - superfluous("tensor_storage_malloc_matrix_slice: storage->TO=0x%x\n", storage->TO); - superfluous("tensor_storage_malloc_matrix_slice: storage->KO=0x%x\n", storage->KO); - superfluous("tensor_storage_malloc_matrix_slice: storage=0x%x\n", storage); - - return storage; -} diff --git a/src/tensor_storage_utility.cc b/src/tensor_storage_utility.cc deleted file mode 100644 index aadc198..0000000 --- a/src/tensor_storage_utility.cc +++ /dev/null @@ -1,198 +0,0 @@ - -#include "error.h" -#include "storage.h" -#include - -int -index_compare_ijk(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb) -{ - int result; - - if (0 == (result = ta->i - tb->i)) { - if (0 == (result = ta->j - tb->j)) { - result = ta->k - tb->k; - } - } - - return result; -} - -int -index_compare_jik(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb) -{ - int result; 
- - if (0 == (result = ta->j - tb->j)) { - if (0 == (result = ta->i - tb->i)) { - result = ta->k - tb->k; - } - } - - return result; -} - -int -index_compare_jki(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb) -{ - int result; - - if (0 == (result = ta->j - tb->j)) { - if (0 == (result = ta->k - tb->k)) { - result = ta->i - tb->i; - } - } - - return result; -} - -int -index_compare_kji(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb) -{ - int result; - - if (0 == (result = ta->k - tb->k)) { - if (0 == (result = ta->j - tb->j)) { - result = ta->i - tb->i; - } - } - - return result; -} - -int -index_compare_kij(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb) -{ - int result; - - if (0 == (result = ta->k - tb->k)) { - if (0 == (result = ta->i - tb->i)) { - result = ta->j - tb->j; - } - } - - return result; -} - -int -index_compare_ikj(coordinate_tuple_t const *ta, coordinate_tuple_t const *tb) -{ - int result; - - if (0 == (result = ta->i - tb->i)) { - if (0 == (result = ta->k - tb->k)) { - result = ta->j - tb->j; - } - } - - return result; -} - - -uint -encoder_for_i(coordinate_tuple_t const *tuple) -{ - return tuple->i; -} - -uint -encoder_for_j(coordinate_tuple_t const *tuple) -{ - return tuple->j; -} - -uint -encoder_for_k(coordinate_tuple_t const *tuple) -{ - return tuple->k; -} - -uint -tensor_storage_index_encode(uint *indices, uint n, coordinate_tuple_t const *tuple, uint nnz, index_encoder_t encoder) -{ - uint i, t; - uint index; - - debug("tensor_storage_index_encode(indices=0x%x, tuple=0x%x, nnz=%d)\n", indices, tuple, nnz); - -#if 0 - for (current = 0; current < nnz; ++current) { - DEBUG("current=%u: i=%u, j=%u, k=%u, index=%u\n", - current, tuple[current].i, tuple[current].j, - tuple[current].k, tuple[current].index); - index = encoder(&tuple[current]); - if (previous != index) { - DEBUG("indices[size=%u]=%u\n", size, current); - indices[size++] = current; - previous = index; - } - } - - DEBUG("indices[size=%u]=%u\n", 
size, nnz); - indices[size++] = nnz; - DEBUG("size=%u\n", size); -#endif - - indices[0] = 0; - index = encoder(&tuple[0]); - - for (i = 1; i < index; ++i) { - indices[i] = 0; - } - - for (t = 0; t < nnz; ++t) { - DEBUG("t=%u: i=%u, j=%u, k=%u, index=%u\n", t, tuple[t].i, tuple[t].j, tuple[t].k, tuple[t].index); - index = encoder(&tuple[t]); - if (i != index) { - DEBUG("indices[i=%u]=%u\n", i, t); - for (; i < index; ++i) { - indices[i] = t; - } - } - } - - for (; i < n; ++i) { - indices[i] = nnz; - } - - DEBUG("indices[i=%u]=%u\n", i, nnz); - indices[i++] = nnz; - DEBUG("i=%u\n", i); - - return i; -} - -void -copier_for_i(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->KO[i] = source->tuples[i].i; -} - -void -copier_for_j(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->KO[i] = source->tuples[i].j; -} - -void -copier_for_k(tensor_storage_compressed_t *destination, tensor_storage_coordinate_t const *source, uint i) -{ - destination->KO[i] = source->tuples[i].k; -} - -void -copier_for_values(tensor_t *destination, tensor_t const *source, uint i) -{ - destination->values[i] = source->values[STORAGE_COORIDINATE(source)->tuples[i].index]; -} - -void -tensor_storage_copy(void *destination, void const *source, uint nnz, index_copy_t copier) -{ - uint i; - - debug("storage_index_copy(destination=0x%x, source=0x%x)\n", destination, source); - - for (i = 0; i < nnz; ++i) { - copier(destination, source, i); - } -} diff --git a/src/tensor_storage_zzekmr.cc b/src/tensor_storage_zzekmr.cc deleted file mode 100644 index 68cc65e..0000000 --- a/src/tensor_storage_zzekmr.cc +++ /dev/null @@ -1,140 +0,0 @@ - -#include "error.h" -#include "memory.h" -#include "mmio.h" -#include "tensor.h" -#include "storage.h" -#include "utility.h" -#include -#include - -static uint g_r; - -int -tensor_storage_index_compare_for_zzekmr_row(void const *a, void const *b) -{ - uint 
ja, jb; - int result; - coordinate_tuple_t const *ta, *tb; - - ta = (coordinate_tuple_t const*) a; - tb = (coordinate_tuple_t const*) b; - ja = ta->j * g_r + ta->k; - jb = tb->j * g_r + tb->k; - - /* We are doing just about exacly what the EKMR encoding does, - except we reverse the order of every other row (we assume, for - now, that all rows are non-empty). We do this for a very simple, - but elegant reason. Take for instance the dense vector, sparse - tensor product: Say we are at the end of a row, in terms of doing - an operation. The cache will have been primed with the elemets - of one of the extremes of the vector. If we naively pull in the - next row, we must also pull in the matching vector elements. - However, if we bring in the next row *in reverse order*, we will - likely already have the matching vector elements in the - cache. Thus, in general, we may not nessearily invalidate the - existing cache lines. */ - - if (0 == (result = ta->i - tb->i)) { - if (ta->i % 2) { /* odd */ - result = jb - ja; - } else { /* even */ - result = ja - jb; - } - } - - return result; -} - -void -tensor_storage_index_copy_for_zzekmr_row(void *destination, void const *source, uint nnz) -{ - uint i; - tensor_storage_coordinate_t const *s; - tensor_storage_extended_t *d; - - s = (tensor_storage_coordinate_t const*) source; - d = (tensor_storage_extended_t*) destination; - - debug("tensor_storage_index_copy_for_zzekmr_row(destination=0x%x, source=0x%x, nnz=%d)\n", d, s, nnz); - - for (i = 0; i < nnz; ++i) { - d->CK[i] = s->tuples[i].j * g_r + s->tuples[i].k; - } -} - -void -tensor_storage_convert_from_coordinate_to_zzekmr(tensor_t *destination, tensor_t *source) -{ - uint i, n, nnz; - tensor_storage_base_t *base; - tensor_storage_extended_t *d; - tensor_storage_coordinate_t *s; - coordinate_tuple_t *tuples; - double *values; - - s = STORAGE_COORIDINATE(source); - d = STORAGE_EXTENDED(destination); - - debug("tensor_storage_convert_from_coordinate_to_zzekmr(destination=0x%x, 
source=0x%x)\n", d, s); - - base = STORAGE_BASE(destination); - nnz = source->nnz; - n = source->n; - values = source->values; - g_r = d->rn; - tuples = s->tuples; - - qsort(tuples, nnz, sizeof(coordinate_tuple_t), base->callbacks->index_compare); - d->rn = tensor_storage_index_encode(d->RO, n, tuples, nnz, base->callbacks->index_r_encoder); - (*base->callbacks->index_copy)(d, s, nnz); - - for (i = 0; i < nnz; ++i) { - destination->values[i] = values[tuples[i].index]; - } -} - -tensor_storage_extended_t* -tensor_storage_malloc_zzekmr(tensor_t const *tensor) -{ - tensor_storage_base_t *base; - tensor_storage_extended_t *storage; - conversion_callbacks_t *callbacks; - - superfluous("tensor_storage_malloc_zzekmr(tensor=0x%x)\n", tensor); - - storage = MALLOC(tensor_storage_extended_t); - storage->CK = MALLOC_N(uint, tensor->nnz); - storage->RO = NULL; - storage->rn = 0; - - callbacks = MALLOC(conversion_callbacks_t); - callbacks->index_compare = NULL; - callbacks->index_r_encoder = NULL; - callbacks->index_copy = NULL; - - switch (tensor->orientation) { - case orientation::row: - storage->rn = tensor->n; - callbacks->index_compare = &tensor_storage_index_compare_for_zzekmr_row; - callbacks->index_r_encoder = &encoder_for_i; - callbacks->index_copy = &tensor_storage_index_copy_for_zzekmr_row; - break; - default: - die("Tensor orientation '%s' not yet supported.\n", orientation_to_string(tensor->orientation)); - break; - } - - storage->rn += 1; - storage->RO = MALLOC_N(uint, storage->rn); - base = (tensor_storage_base_t*) storage; - base->callbacks = callbacks; - - superfluous("tensor_storage_malloc_zzekmr: callbacks=0x%x\n", callbacks); - superfluous("tensor_storage_malloc_zzekmr: storage->CK=0x%x\n", storage->CK); - superfluous("tensor_storage_malloc_zzekmr: storage->size (of R)=%d\n", storage->rn); - superfluous("tensor_storage_malloc_zzekmr: storage->RO=0x%x\n", storage->RO); - superfluous("tensor_storage_malloc_zzekmr: storage=0x%x\n", storage); - - return storage; 
-} diff --git a/src/tool_convert.cc b/src/tool_convert.cc deleted file mode 100644 index 592fcd7..0000000 --- a/src/tool_convert.cc +++ /dev/null @@ -1,169 +0,0 @@ - -#include "cache.h" -#include "compatible.h" -#include "error.h" -#include "file.h" -#include "matrix.h" -#include "operation.h" -#include "tensor.h" -#include "tool.h" -#include "utility.h" -#include "vector.h" -#include -#include -#include -#include -#include -#include -#include - -extern cache_t *cache; -extern uint cache_size; -extern uint cache_line_size; -extern bool emit_latex; -extern uint iterations; -extern char *tool_name; -extern tool::type_t tool_type; -extern bool simulate; -extern bool verbose; -extern verbosity::type_t noisiness; -extern bool write_results; - -void -convert_tool_usage() -{ - print_tool_banner(); - message("\nUsage:\n"); - message("\t%s [options] [output]\n", tool_name); - message("\nOptions:\n"); - message("\t-h\tthis screen\n"); - message("\t-l\temit LaTeX code as output (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_EMIT_LATEX)); - message("\t-s\tstrategy (default: %s)\n", strategy_to_string(DEFAULT_STRATEGY)); - print_strategies("\t\t- %s\n"); - message("\t-o\torientation (default: %s)\n", orientation_to_string(DEFAULT_ORIENTATION)); - print_orientations("\t\t- %s\n"); - message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE)); - message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max); - message("\nExample:\n\n"); - message("\t$ ./tensor %s -s compressed -o column ieee-fig4.in tensor.out\n", tool_name); - message("\tReading ieee-fig4.in ... done [0.000305]\n"); - message("\tConverting from 'coordinate' to 'compressed-column' ... done [0.000010]\n"); - message("\tWriting tensor.out ... 
done [0.000031]\n"); - exit(1); -} - -tensor_t* -timed_tensor_convert(tensor_t *source, strategy::type_t strategy, orientation::type_t orientation) -{ - precision_timer_t t; - tensor_t *tensor; - - progress("Converting from '%s' to '%s-%s' ... ", - strategy_to_string(source->strategy), - strategy_to_string(strategy), - orientation_to_string(orientation)); - timer_start(&t); - tensor = tensor_convert(source, strategy, orientation); - timer_end(&t); - print_elapsed_time(t); - - return tensor; -} - -void -convert_tool_main(int argc, char *argv[]) -{ - int c, offset; - char *name; - tensor_t *tensor, *result; - strategy::type_t strategy; - orientation::type_t orientation; - - /* just to be safe, set the tensors to null */ - tensor = result = NULL; - - /* set the program's defaults */ - orientation = DEFAULT_ORIENTATION; - strategy = DEFAULT_STRATEGY; - - /* we will privide our own error messages */ - opterr = 0; - - /* extract any command-line options the user provided */ - while (-1 != (c = getopt(argc, argv, ":hlo:s:vV:"))) { - switch (c) { - case 'h': - convert_tool_usage(); - break; - case 'l': - emit_latex = !emit_latex; - break; - case 'o': - if (isdigit(optarg[0])) { - orientation = (orientation::type_t) atoi(optarg); - } else { - orientation = string_to_orientation(optarg); - } - break; - case 's': - if (isdigit(optarg[0])) { - strategy = (strategy::type_t) atoi(optarg); - } else { - strategy = string_to_strategy(optarg); - } - break; - case 'v': - verbose = !verbose; - break; - case 'V': - noisiness = (verbosity::type_t) atoi(optarg); - if (0 == noisiness) { - noisiness = DEFAULT_VERBOSITY; - } - break; - case ':': - die("Option -%c requires an operand; that is, an integer or string value.\n", optopt); - break; - case '?': - die("Unknown option: `-%c'\n", optopt); - break; - default: - abort(); - break; - } - } - - if (noisiness > DEFAULT_VERBOSITY) { - verbose = true; - } - - /* count the number of remaining arguments */ - if (argc-optind < 1) { - 
convert_tool_usage(); - } - - /* print program options, for debugging purposes */ - print_tool_options(); - debug("convert_tool_main: orientation='%s'\n", orientation_to_string(orientation)); - debug("convert_tool_main: strategy='%s'\n", strategy_to_string(strategy)); - - /* parse the remaining command line options */ - offset = optind; - name = argv[offset++]; - tensor = timed_tensor_read(name); - debug("main: tensor=0x%x\n", tensor); - - if (strategy == tensor->strategy) { - /* we'll deal with differing orientation when it comes up */ - result = tensor; - tensor = NULL; - } else { - result = timed_tensor_convert(tensor, strategy, orientation); - } - debug("main: result=0x%x\n", result); - - timed_tensor_write(argc, argv, offset, result); - - tensor_free(result); - tensor_free(tensor); -} diff --git a/src/tool_permute.cc b/src/tool_permute.cc deleted file mode 100644 index b3d348b..0000000 --- a/src/tool_permute.cc +++ /dev/null @@ -1,171 +0,0 @@ - -#include "cache.h" -#include "compatible.h" -#include "error.h" -#include "file.h" -#include "matrix.h" -#include "memory.h" -#include "operation.h" -#include "tensor.h" -#include "tool.h" -#include "utility.h" -#include "vector.h" -#include -#include -#include -#include -#include "timer.h" -#include -#include - -extern bool human_readable; -extern char *tool_name; -extern tool::type_t tool_type; -extern bool tracing; -extern bool verbose; -extern verbosity::type_t noisiness; -extern bool write_results; - -static permutation_heuristic::type_t heuristic; - -void -permute_tool_usage() -{ - print_tool_banner(); - message("\nUsage:\n"); - message("\t%s [options] ... 
[output]\n", tool_name); - message("\nOptions:\n"); - message("\t-h\tthis screen\n"); - message("\t-p\tpermutation heuristic (default: %s)\n", permutation_heuristic_to_string(DEFAULT_PERMUTATION_HEURISTIC)); - print_permutation_heuristics_with_descriptions("\t\t- %s : %s\n"); - message("\t-t\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING)); - message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE)); - message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max); - message("\t-w\twrite results (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_WRITE_RESULTS)); - message("\nExample:\n\n"); - message("\t$ ./tensor %s -p naive-minimum tensor.in tensor.out\n", tool_name); - message("\tReading vector.in ... done [0.000305]\n"); - message("\tReading tensor.in ... done [0.000235]\n"); - message("\tConverting from 'coordinate' to 'compressed-column' ... done [0.000010]\n"); - message("\tWriting matrix.out ... done [0.000031]\n"); - exit(1); -} - -tensor_t* -timed_tensor_permute(tensor_t *tensor) -{ - precision_timer_t t; - tensor_t *permuted; - - progress("Permuting tensor using the '%s' heuristic ... 
", - permutation_heuristic_to_string(heuristic)); - timer_start(&t); - permuted = tensor_permute(tensor, heuristic); - timer_end(&t); - print_elapsed_time(t); - - return permuted; -} - -void -timed_permutation(int argc, char *argv[]) -{ - int offset; - char *name; - tensor_t *tensor, *permuted; - - offset = optind; - name = argv[offset++]; - tensor = timed_tensor_read(name); - debug("timed_permutation: tensor=0x%x\n", tensor); - - if (permutation_heuristic::none == heuristic) { - print_elapsed_time(0.0); /* just a no-op */ - } else { - permuted = timed_tensor_permute(tensor); - tensor_free(tensor); - tensor = permuted; - } - debug("timed_permutation: permutation=0x%x\n", tensor); - - /* if we are not printing times for each procedure out in a human - consumable way, then we need to terminate the line containing all - the timings for this instance */ - if (!human_readable) { - message("\n"); - } - - timed_tensor_write(argc, argv, offset, tensor); - tensor_free(tensor); -} - -void -permute_tool_main(int argc, char *argv[]) -{ - int c; - - /* set the program's defaults */ - heuristic = DEFAULT_PERMUTATION_HEURISTIC; - - /* we will privide our own error messages */ - opterr = 0; - - /* extract any command-line options the user provided */ - while (-1 != (c = getopt(argc, argv, ":hp:tuvV:w"))) { - switch (c) { - case 'h': - permute_tool_usage(); - break; - case 'p': - if (isdigit(optarg[0])) { - heuristic = (permutation_heuristic::type_t) atoi(optarg); - } else { - heuristic = string_to_permutation_heuristic(optarg); - } - break; - case 't': - tracing = !tracing; - break; - case 'u': - human_readable = !human_readable; - break; - case 'v': - verbose = !verbose; - break; - case 'V': - noisiness = (verbosity::type_t) atoi(optarg); - if (0 == noisiness) { - noisiness = DEFAULT_VERBOSITY; - } - break; - case 'w': - write_results = !write_results; - break; - case ':': - die("Option -%c requires an operand; that is, an integer or string value.\n", optopt); - break; - case 
'?': - die("Unknown option: `-%c'\n", optopt); - break; - default: - abort(); - break; - } - } - - if (noisiness > DEFAULT_VERBOSITY) { - verbose = true; - } - - /* count the number of remaining arguments */ - if (argc-optind < 1) { - permute_tool_usage(); - } - - /* print program options, for debugging purposes */ - print_tool_options(); - debug("permute_tool_main: heuristic='%s'\n", permutation_heuristic_to_string(heuristic)); - - /* pass control over to some naive timing procedures */ - timed_permutation(argc, argv); -} From a372446b92806ce62f68c963464848204421e98e Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Wed, 9 Nov 2011 13:56:32 -0700 Subject: [PATCH 28/57] + Removed uneeded tools --- src/tool_utility.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/tool_utility.cc b/src/tool_utility.cc index 7994529..895b451 100644 --- a/src/tool_utility.cc +++ b/src/tool_utility.cc @@ -18,10 +18,8 @@ extern verbosity::type_t noisiness; extern bool write_results; #define TENSOR_DESCRIPTION "A tool for working with tensors." -#define CONVERT_DESCRIPTION "A tool for converting between th-order tensor storage strategies." #define GENERATE_DESCRIPTION "A tool for generating varieties of th-order tensors." #define EFFECTUATE_DESCRIPTION "A tool for performing computations on th-order tensors." -#define PERMUTE_DESCRIPTION "A tool for permuting the non-zeros of th-order tensors." #define VERSION "Version 0.01 (" __DATE__ "), " \ "Copyright (C) 2011, and GPLv3'd, by Ben Burnett\n" \ "This is free software; see the source for copying conditions. 
There is NO\n" \ @@ -32,19 +30,15 @@ extern bool write_results; static char const *map_tool_to_string[] = { "unknown", "tensor", - "convert", "generate", "effectuate", - "permute" }; static char const *map_tools_to_description[] = { "unknown", TENSOR_DESCRIPTION, - CONVERT_DESCRIPTION, GENERATE_DESCRIPTION, EFFECTUATE_DESCRIPTION, - PERMUTE_DESCRIPTION }; char const* string_from_tool(tool::type_t tool) From a6392d3fd45cc3d3edd79ff777d56c72a039b1b2 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Wed, 9 Nov 2011 13:57:53 -0700 Subject: [PATCH 29/57] + Added command line support to specify work partitioning scheme for threads --- src/main.cc | 34 +++++++------ src/operation_n_mode_product.cc | 88 +++++++++++++++++++++++++++++---- src/thread.cc | 28 ++++++++++- src/thread.h | 22 +++++++++ src/tool.h | 2 +- src/tool_effectuate.cc | 46 ++++++++++------- 6 files changed, 174 insertions(+), 46 deletions(-) diff --git a/src/main.cc b/src/main.cc index b84b7bb..922276c 100644 --- a/src/main.cc +++ b/src/main.cc @@ -7,6 +7,7 @@ #include "operation.h" #include "strings.h" #include "tensor.h" +#include "thread.h" #include "tool.h" #include "utility.h" #include "vector.h" @@ -17,22 +18,23 @@ #include #include -cache_t *cache; -uint cache_size; -uint cache_line_size; -uint iterations; -uint memory_stride; -uint seed; -uint thread_count; -char *tool_name; -tool::type_t tool_type; -bool tracing; -bool simulate; -bool human_readable; -bool verbose; -verbosity::type_t noisiness; -bool write_results; -bool emit_latex; +cache_t *cache; +uint cache_size; +uint cache_line_size; +uint iterations; +uint memory_stride; +thread::partition::type_t thread_partition; +uint seed; +uint thread_count; +char *tool_name; +tool::type_t tool_type; +bool tracing; +bool simulate; +bool human_readable; +bool verbose; +verbosity::type_t noisiness; +bool write_results; +bool emit_latex; void usage() diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index c5b1e57..3760bdd 
100644 --- a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -11,11 +11,12 @@ #include #include -extern cache_t *cache; -extern uint memory_stride; -extern uint thread_count; +extern cache_t *cache; +extern uint memory_stride; +extern uint thread_count; +extern thread::partition::type_t thread_partition; -static pthread_mutex_t tube_lock; +static pthread_mutex_t tube_lock; /* Computing ($pT$): @@ -39,7 +40,7 @@ typedef struct { } product_thread_data_t; int -traditional_next_tube(product_thread_data_t *data) +tube_next(product_thread_data_t *data) { uint k; @@ -50,7 +51,7 @@ traditional_next_tube(product_thread_data_t *data) } thread_address_t -traditional_fiber_product(thread_argument_t *argument) +tube_product(thread_argument_t *argument) { int t; uint i, j, k, offset; @@ -66,7 +67,7 @@ traditional_fiber_product(thread_argument_t *argument) P = data->vector->data; T = data->tensor->values; - while (-1 != (t = traditional_next_tube(data))) { + while (-1 != (t = tube_next(data))) { sum = 0; offset = t*n; i = t/n; @@ -80,8 +81,54 @@ traditional_fiber_product(thread_argument_t *argument) return NULL; } +int +slice_next(product_thread_data_t *data) +{ + uint k; + + thread_mutex_lock(&tube_lock); + k = data->done++; + thread_mutex_unlock(&tube_lock); + return k < (data->tensor->n) ? 
k : -1; +} + +thread_address_t +slice_product(thread_argument_t *argument) +{ + int i; + uint j, k; + uint ioffset, joffset; + uint n, sum[1000]; + uint *P; + double **M, *T; + product_thread_data_t *data; + + data = (product_thread_data_t*) thread_data(argument); + + n = data->tensor->n; + M = data->matrix->data; + P = data->vector->data; + T = data->tensor->values; + + while (-1 != (i = slice_next(data))) { + ioffset = i*n*n; + for (j = 0; j < n; ++j) { + sum[j] = 0; + joffset = ioffset+j*n; + for (k = 0; k < n; ++k) { + sum[j] += P[k] * T[joffset+k]; + } + } + for (j = 0; j < n; ++j) { + M[i][j] = sum[j]; + } + } + + return NULL; +} + void -threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor, thread_function_t function) { product_thread_data_t data; @@ -95,11 +142,32 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t data.matrix = matrix; data.vector = vector; data.tensor = tensor; - + thread_mutex_init(&tube_lock); - thread_fork(thread_count, traditional_fiber_product, &data, NULL); + thread_fork(thread_count, slice_product, &data, NULL); thread_mutex_destroy(&tube_lock); } + +void +threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +{ + thread_function_t function; + + switch (thread_partition) { + case thread::partition::tube: + function = (thread_function_t) &tube_product; + break; + case thread::partition::slice: + function = (thread_function_t) &slice_product; + break; + default: + die("serial_n_mode_product: tensor product for '%s' strategy is not currently supported.\n", + strategy_to_string(tensor->strategy)); + break; + } + + threaded_n_mode_product_array(matrix, vector, tensor, function); +} void serial_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) diff --git a/src/thread.cc b/src/thread.cc index 
eec77dd..18f8ca9 100644 --- a/src/thread.cc +++ b/src/thread.cc @@ -5,11 +5,37 @@ */ #include "thread.h" - +#include "utility.h" #include #include #include /* for EBUSY */ +static char const *map_thread_partition_to_string[] = { + "unknown", + "tube", + "slice" +}; + +char const* +thread_partition_to_string(thread::partition::type_t partition) +{ + return map_thread_partition_to_string[partition]; +} + +thread::partition::type_t +string_to_thread_partition(char const *name) +{ + uint i; + + for (i = 0; i < COUNT_OF(map_thread_partition_to_string); ++i) { + if (0 == strcmp(name, map_thread_partition_to_string[i])) { + return (thread::partition::type_t) i; + } + } + + return thread::partition::unknown; +} + /************************************************* * attempt to lock a mutex */ diff --git a/src/thread.h b/src/thread.h index 8099434..3d418a6 100644 --- a/src/thread.h +++ b/src/thread.h @@ -8,6 +8,28 @@ #ifndef _THREAD_H_ #define _THREAD_H_ +namespace thread { + + namespace model { + typedef enum { + unknown, + traditional + } type_t; + } + + namespace partition { + typedef enum { + unknown, + tube, + slice + } type_t; + } + +} + +char const* thread_partition_to_string(thread::partition::type_t partition); +thread::partition::type_t string_to_thread_partition(char const *name); + /* Linux defs: * _REENTRANT to get thread-safe libs * _POSIX_SOURCE to get POSIX semantics diff --git a/src/tool.h b/src/tool.h index a40c1cf..ba641a8 100644 --- a/src/tool.h +++ b/src/tool.h @@ -25,11 +25,11 @@ namespace tool { #define DEFAULT_MEMORY_STRIDE 32 #define DEFAULT_OPERATION operation::n_mode_product #define DEFAULT_ORIENTATION orientation::row -#define DEFAULT_PERMUTATION_HEURISTIC permutation_heuristic::none #define DEFAULT_SIMULATE false #define DEFAULT_STRATEGY strategy::compressed #define DEFAULT_TRACING false #define DEFAULT_THREAD_COUNT 1 +#define DEFAULT_THREAD_PARTITION thread::partition::tube #define DEFAULT_VERBOSE false #define DEFAULT_VERBOSITY verbosity::low 
#define DEFAULT_WRITE_RESULTS false diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index 314d116..9119197 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -7,6 +7,7 @@ #include "memory.h" #include "operation.h" #include "tensor.h" +#include "thread.h" #include "tool.h" #include "utility.h" #include "vector.h" @@ -18,20 +19,21 @@ #include #include -extern cache_t *cache; -extern uint cache_size; -extern uint cache_line_size; -extern bool human_readable; -extern uint iterations; -extern uint memory_stride; -extern uint thread_count; -extern char *tool_name; -extern tool::type_t tool_type; -extern bool simulate; -extern bool tracing; -extern bool verbose; -extern verbosity::type_t noisiness; -extern bool write_results; +extern cache_t *cache; +extern uint cache_size; +extern uint cache_line_size; +extern bool human_readable; +extern uint iterations; +extern uint memory_stride; +extern uint thread_count; +extern thread::partition::type_t thread_partition; +extern char *tool_name; +extern tool::type_t tool_type; +extern bool simulate; +extern bool tracing; +extern bool verbose; +extern verbosity::type_t noisiness; +extern bool write_results; static operation::type_t optcode; @@ -180,10 +182,10 @@ effectuate_tool_main(int argc, char *argv[]) int c; /* set the program's defaults */ - memory_stride = DEFAULT_MEMORY_STRIDE; - optcode = DEFAULT_OPERATION; - thread_count = DEFAULT_THREAD_COUNT; - + memory_stride = DEFAULT_MEMORY_STRIDE; + optcode = DEFAULT_OPERATION; + thread_count = DEFAULT_THREAD_COUNT; + thread_partition = DEFAULT_THREAD_PARTITION; /* we will privide our own error messages */ opterr = 0; @@ -219,6 +221,13 @@ effectuate_tool_main(int argc, char *argv[]) optcode = string_to_operation(optarg); } break; + case 'p': + if (isdigit(optarg[0])) { + thread_partition = (thread::partition::type_t) atoi(optarg); + } else { + thread_partition = string_to_thread_partition(optarg); + } + break; case 'r': memory_stride = atoi(optarg); if 
(0 == memory_stride) { @@ -278,6 +287,7 @@ effectuate_tool_main(int argc, char *argv[]) debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode)); debug("effectuate_tool_main: memory_stride=%d\n", memory_stride); debug("effectuate_tool_main: thread_count=%d\n", thread_count); + debug("effectuate_tool_main: thread_partition='%s'\n", thread_partition_to_string(thread_partition)); /* if we are just running a simulation, then we only do one iteration; otherwise, it would be really slow */ From 83344743ba85a312461cef89bfb17e8a29ea84a0 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Thu, 10 Nov 2011 20:47:26 -0700 Subject: [PATCH 30/57] + Using atomic increment instead of a mutex locked int + Configured the thread_fork call to allow for cpu affinity --- src/operation_n_mode_product.cc | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index 3760bdd..0bbedc5 100644 --- a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -16,8 +16,6 @@ extern uint memory_stride; extern uint thread_count; extern thread::partition::type_t thread_partition; -static pthread_mutex_t tube_lock; - /* Computing ($pT$): Let $\T \in R^{n\times n\times n}$ be a tensor. @@ -42,11 +40,11 @@ typedef struct { int tube_next(product_thread_data_t *data) { - uint k; + volatile uint k; - thread_mutex_lock(&tube_lock); - k = data->done++; - thread_mutex_unlock(&tube_lock); + /* rather than a lock we can take advantage of the architecture and + issue an atomic fetch and increment */ + k = __sync_fetch_and_add(&data->done, 1); return k < (data->tensor->n*data->tensor->n) ? 
k : -1; } @@ -55,9 +53,10 @@ tube_product(thread_argument_t *argument) { int t; uint i, j, k, offset; - uint n, sum; + uint n; uint *P; double **M, *T; + double sum; product_thread_data_t *data; data = (product_thread_data_t*) thread_data(argument); @@ -84,12 +83,12 @@ tube_product(thread_argument_t *argument) int slice_next(product_thread_data_t *data) { - uint k; + volatile uint k; - thread_mutex_lock(&tube_lock); - k = data->done++; - thread_mutex_unlock(&tube_lock); - return k < (data->tensor->n) ? k : -1; + /* rather than a lock we can take advantage of the architecture and + issue an atomic fetch and increment */ + k = __sync_fetch_and_add(&data->done, 1); + return k < data->tensor->n ? k : -1; } thread_address_t @@ -98,9 +97,10 @@ slice_product(thread_argument_t *argument) int i; uint j, k; uint ioffset, joffset; - uint n, sum[1000]; + uint n; uint *P; double **M, *T; + double sum[1000]; product_thread_data_t *data; data = (product_thread_data_t*) thread_data(argument); @@ -143,9 +143,7 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t data.vector = vector; data.tensor = tensor; - thread_mutex_init(&tube_lock); - thread_fork(thread_count, slice_product, &data, NULL); - thread_mutex_destroy(&tube_lock); + thread_afork(thread_count, slice_product, &data, NULL); } void From 0fe863469e0db9cba608cf7e0481e16dd227fd51 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Thu, 10 Nov 2011 20:47:57 -0700 Subject: [PATCH 31/57] + Added cpu affinity configuration for the thread_fork procedure --- src/thread.cc | 26 ++++++++++++++++++++++---- src/thread.h | 25 +++++++++++++++++++++---- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/thread.cc b/src/thread.cc index 18f8ca9..c25ef50 100644 --- a/src/thread.cc +++ b/src/thread.cc @@ -10,6 +10,8 @@ #include #include /* for EBUSY */ +extern uint thread_count; + static char const *map_thread_partition_to_string[] = { "unknown", "tube", @@ -72,12 +74,15 @@ thread_wait(pthread_t 
*thread, thread_address_t exitcode) * run nthreads threads in the routine start */ void _thread_fork(int nthreads, - thread_function_t start, - thread_address_t arg, - thread_address_t *exitcodes) + thread_function_t start, + thread_address_t arg, + thread_address_t *exitcodes, + int setaffinity) { int i; thread_argument_t *args; + pthread_attr_t attr, *pattr; + cpu_set_t mask; thread_address_t *address; if (nthreads<1) { @@ -89,9 +94,22 @@ void _thread_fork(int nthreads, for (i=0; i Date: Fri, 11 Nov 2011 10:16:10 -0700 Subject: [PATCH 32/57] + Darwin does not support the cpu_set_t structure; there is probably an equivalent one that just needs to be found --- src/thread.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/thread.cc b/src/thread.cc index c25ef50..e968b14 100644 --- a/src/thread.cc +++ b/src/thread.cc @@ -10,8 +10,6 @@ #include #include /* for EBUSY */ -extern uint thread_count; - static char const *map_thread_partition_to_string[] = { "unknown", "tube", @@ -81,9 +79,11 @@ void _thread_fork(int nthreads, { int i; thread_argument_t *args; + thread_address_t *address; pthread_attr_t attr, *pattr; +#ifdef __linux__ cpu_set_t mask; - thread_address_t *address; +#endif if (nthreads<1) { die("thread_mutex_trylock: nthreads<1\n"); @@ -100,14 +100,16 @@ void _thread_fork(int nthreads, #if 0 /* for this to work correctly, we need to detect the number of CPUs */ +#ifdef __linux__ if (setaffinity) { CPU_ZERO(&mask); CPU_SET(i%thread_count,&mask); pthread_attr_setaffinity_np(&attr,sizeof(mask),&mask); pattr = &attr; - } #endif + } thread_create_with_attr(&args[i].self,pattr,start,args+i); +#endif } pthread_attr_destroy(&attr); for (i=0; i Date: Fri, 11 Nov 2011 10:17:43 -0700 Subject: [PATCH 33/57] + Changed thread linking linker line to be cross-platform --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index d59896a..15ecac1 100644 --- a/src/Makefile +++ b/src/Makefile 
@@ -16,7 +16,7 @@ ifndef SIMULATE EXTRA_DEBUG += -DNOSIMULATE endif EXTRA_CXXFLAGS=-c $(EXTRA_DEBUG) $(STRICT) $(INCLUDES) $(CPPX11) -EXTRA_LDFLAGS=-Wall -lpthread $(EXTRA_DEBUG) +EXTRA_LDFLAGS=-Wall -thread $(EXTRA_DEBUG) HEADERS_CACHE=address.h cache.h hash.h HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h \ From 2205a00847244e1efba5bfecde080d9f7a7b3bbe Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Fri, 11 Nov 2011 16:21:39 -0700 Subject: [PATCH 34/57] + Returned the vector object back to using doubles vs uints --- src/vector.h | 2 +- src/vector_malloc.cc | 2 +- src/vector_read.cc | 2 +- src/vector_write.cc | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vector.h b/src/vector.h index 22b71ac..709aff5 100644 --- a/src/vector.h +++ b/src/vector.h @@ -8,7 +8,7 @@ typedef struct { uint n; ownership::type_t owner; - uint *data; + double *data; } vector_t; vector_t* vector_malloc(uint n, ownership::type_t owner = ownership::creator); diff --git a/src/vector_malloc.cc b/src/vector_malloc.cc index 7b8daef..afa7d4b 100644 --- a/src/vector_malloc.cc +++ b/src/vector_malloc.cc @@ -19,7 +19,7 @@ vector_malloc(uint n, ownership::type_t owner) return v; } - v->data = MALLOC_N(uint, n); + v->data = MALLOC_N(double, n); return v; } diff --git a/src/vector_read.cc b/src/vector_read.cc index e8d8bb0..d6c377e 100644 --- a/src/vector_read.cc +++ b/src/vector_read.cc @@ -20,7 +20,7 @@ vector_read_array(FILE *f) v = vector_malloc(n); for (i = 0; i < v->n; ++i) { - fscanf(f, "%u\n", &v->data[i]); + fscanf(f, "%lg\n", &v->data[i]); } return v; diff --git a/src/vector_write.cc b/src/vector_write.cc index 5907f4c..03b26b2 100644 --- a/src/vector_write.cc +++ b/src/vector_write.cc @@ -33,7 +33,7 @@ vector_write_array(FILE *f, vector_t const *v) } for (i = 0; i < v->n; ++i) { - fprintf(f, "%d\n", v->data[i]); + fprintf(f, "%10.6g\n", v->data[i]); } } From c9ec3f3fa777f8fe31f6edcedb316f1dab77054d Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: 
Fri, 11 Nov 2011 16:22:40 -0700 Subject: [PATCH 35/57] + Moved all calculation code in to a seperate set of source files --- src/operation.cc | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ src/operation.h | 3 ++ 2 files changed, 83 insertions(+) create mode 100644 src/operation.cc diff --git a/src/operation.cc b/src/operation.cc new file mode 100644 index 0000000..c6a8fd5 --- /dev/null +++ b/src/operation.cc @@ -0,0 +1,80 @@ + +#include "cache.h" +#include "compatible.h" +#include "error.h" +#include "matrix.h" +#include "operation.h" +#include "thread.h" +#include "tensor.h" +#include "utility.h" +#include "vector.h" +#include +#include + +extern uint thread_count; + +void +threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +{ + debug("threaded_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + + compatible(vector, tensor); + + switch (tensor->strategy) { + case strategy::array: + threaded_n_mode_product_array(matrix, vector, tensor); + break; + default: + die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n", + strategy_to_string(tensor->strategy)); + break; + } +} + +void +serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +{ + debug("serial_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); + + compatible(vector, tensor); + + switch (tensor->strategy) { + case strategy::array: + /* in this case, we want to compare the single thread version of + the same algo against the n-threaded version */ + threaded_n_mode_product_array(matrix, vector, tensor); + break; + default: + die("serial_n_mode_product: tensor product for '%s' strategy is not currently supported.\n", + strategy_to_string(tensor->strategy)); + break; + } +} + +void +operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) +{ + debug("operation_n_mode_product(matrix=0x%x, vector=0x%x, 
tensor=0x%x)\n", matrix, vector, tensor); + + if (thread_count <= 1) { + serial_n_mode_product(matrix, vector, tensor); + } else { + threaded_n_mode_product(matrix, vector, tensor); + } +} + +matrix_t* +operation_n_mode_product(vector_t const *vector, tensor_t const *tensor) +{ + matrix_t *matrix; + + compatible(vector, tensor); + debug("operation_n_mode_product(vector=0x%x, tensor=0x%x)\n", vector, tensor); + + matrix = matrix_malloc(tensor->m, tensor->n, ownership::creator); + debug("operation_n_mode_product: matrix=0x%x\n", matrix); + + operation_n_mode_product(matrix, vector, tensor); + + return matrix; +} diff --git a/src/operation.h b/src/operation.h index 0542523..d8f6bce 100644 --- a/src/operation.h +++ b/src/operation.h @@ -20,6 +20,9 @@ operation::type_t string_to_operation(char const *name); void operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor); matrix_t *operation_n_mode_product(vector_t const *vector, tensor_t const *tensor); +void serial_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor); +void threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor); + #endif /* From b5f9a2f7e2f05301d8e1c1402677eb852625a93c Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Fri, 11 Nov 2011 16:23:40 -0700 Subject: [PATCH 36/57] + Darwin does not support the same affinity settings as Linux; I'll return to them when it seems nessesary --- src/thread.cc | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/thread.cc b/src/thread.cc index e968b14..f6bcd5d 100644 --- a/src/thread.cc +++ b/src/thread.cc @@ -71,6 +71,36 @@ thread_wait(pthread_t *thread, thread_address_t exitcode) /************************************************* * run nthreads threads in the routine start */ +void _thread_fork(int nthreads, + thread_function_t start, + thread_address_t arg, + thread_address_t *exitcodes, + int setaffinity) +{ + int i; + 
thread_argument_t *args; + thread_address_t *address; + + if (nthreads<1) { + die("thread_mutex_trylock: nthreads<1\n"); + } + if ((args=(thread_argument_t *) malloc(nthreads*sizeof(thread_argument_t)))==NULL) { + die("thread_fork: malloc failed!\n"); + } + for (i=0; i Date: Fri, 11 Nov 2011 16:24:27 -0700 Subject: [PATCH 37/57] + Added Darwin support for BLAS routines --- src/Makefile | 15 +++-- src/operation_n_mode_product.cc | 115 ++++++-------------------------- 2 files changed, 31 insertions(+), 99 deletions(-) diff --git a/src/Makefile b/src/Makefile index 15ecac1..9c27480 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,10 +1,11 @@ -CXX=g++ -SYMBOLS=echo OS=`uname` -ifeq ($(OS), Darwin) +ifeq "$(OS)" "Darwin" CXX=clang CPPX11=-std=c++0x -stdlib=libc++ SYMBOLS=dsymutil +else + CXX=g++ + SYMBOLS=echo endif INCLUDES=-I. STRICT=-pedantic -Wall -Wno-variadic-macros @@ -15,8 +16,10 @@ endif ifndef SIMULATE EXTRA_DEBUG += -DNOSIMULATE endif -EXTRA_CXXFLAGS=-c $(EXTRA_DEBUG) $(STRICT) $(INCLUDES) $(CPPX11) -EXTRA_LDFLAGS=-Wall -thread $(EXTRA_DEBUG) +EXTRA_CXXFLAGS=-c -DYA_BLAS -DYA_LAPACK -DYA_BLASMULT $(EXTRA_DEBUG) \ + $(STRICT) $(INCLUDES) $(CPPX11) +EXTRA_LDFLAGS=-Wall -thread -lblas -llapack -framework Accelerate \ + $(EXTRA_DEBUG) HEADERS_CACHE=address.h cache.h hash.h HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h \ @@ -31,7 +34,7 @@ HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE) \ SOURCES_CACHE=address.cc cache.cc hash.cc SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc \ - information.cc latex.cc memory.cc mmio.cc \ + information.cc latex.cc memory.cc mmio.cc operation.cc \ operation_n_mode_product.cc operation_utility.cc random.cc \ strings.cc thread.cc timer.cc tool_effectuate.cc \ tool_generate.cc tool_timing.cc tool_utility.cc types.cc \ diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index 0bbedc5..cdc009a 100644 --- a/src/operation_n_mode_product.cc +++ 
b/src/operation_n_mode_product.cc @@ -10,6 +10,11 @@ #include "vector.h" #include #include +#ifdef __APPLE__ +#include +#else +#include +#endif extern cache_t *cache; extern uint memory_stride; @@ -52,11 +57,9 @@ thread_address_t tube_product(thread_argument_t *argument) { int t; - uint i, j, k, offset; + uint i, j, offset; uint n; - uint *P; - double **M, *T; - double sum; + double **M, *T, *P; product_thread_data_t *data; data = (product_thread_data_t*) thread_data(argument); @@ -67,14 +70,10 @@ tube_product(thread_argument_t *argument) T = data->tensor->values; while (-1 != (t = tube_next(data))) { - sum = 0; - offset = t*n; - i = t/n; - j = t%n; - for (k = 0; k < n; ++k) { - sum += P[k] * T[offset+k]; - } - M[i][j] = sum; + offset = t*n; + i = t/n; + j = t%n; + M[i][j] = cblas_ddot(n, P, 1, T+offset, 1); } return NULL; @@ -95,12 +94,10 @@ thread_address_t slice_product(thread_argument_t *argument) { int i; - uint j, k; + uint j; uint ioffset, joffset; uint n; - uint *P; - double **M, *T; - double sum[1000]; + double **M, *T, *P; product_thread_data_t *data; data = (product_thread_data_t*) thread_data(argument); @@ -113,14 +110,8 @@ slice_product(thread_argument_t *argument) while (-1 != (i = slice_next(data))) { ioffset = i*n*n; for (j = 0; j < n; ++j) { - sum[j] = 0; joffset = ioffset+j*n; - for (k = 0; k < n; ++k) { - sum[j] += P[k] * T[joffset+k]; - } - } - for (j = 0; j < n; ++j) { - M[i][j] = sum[j]; + M[i][j] = cblas_ddot(n, P, 1, T+joffset, 1); } } @@ -142,8 +133,8 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t data.matrix = matrix; data.vector = vector; data.tensor = tensor; - - thread_afork(thread_count, slice_product, &data, NULL); + + thread_afork(thread_count, function, &data, NULL); } void @@ -170,10 +161,10 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t void serial_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { - uint i, j, k, index; 
+ uint i, j, k; + uint index, sum; uint n; - uint *P; - double **M, *T; + double **M, *T, *P; debug("n_mode_product_array(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); @@ -184,74 +175,12 @@ serial_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t c for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { + sum = 0; for (k = 0; k < n; ++k) { index = tensor_index(tensor, i, j, k); - M[i][j] += P[k] * T[index]; + sum += P[k] * T[index]; } + M[i][j] = sum; } } } - -void -threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) -{ - debug("threaded_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); - - compatible(vector, tensor); - - switch (tensor->strategy) { - case strategy::array: - threaded_n_mode_product_array(matrix, vector, tensor); - break; - default: - die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n", - strategy_to_string(tensor->strategy)); - break; - } -} - -void -serial_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) -{ - debug("serial_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); - - compatible(vector, tensor); - - switch (tensor->strategy) { - case strategy::array: - serial_n_mode_product_array(matrix, vector, tensor); - break; - default: - die("serial_n_mode_product: tensor product for '%s' strategy is not currently supported.\n", - strategy_to_string(tensor->strategy)); - break; - } -} - -void -operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) -{ - debug("operation_n_mode_product(matrix=0x%x, vector=0x%x, tensor=0x%x)\n", matrix, vector, tensor); - - if (thread_count <= 1) { - serial_n_mode_product(matrix, vector, tensor); - } else { - threaded_n_mode_product(matrix, vector, tensor); - } -} - -matrix_t* -operation_n_mode_product(vector_t const *vector, tensor_t const *tensor) -{ - matrix_t *matrix; - - 
compatible(vector, tensor); - debug("operation_n_mode_product(vector=0x%x, tensor=0x%x)\n", vector, tensor); - - matrix = matrix_malloc(tensor->m, tensor->n, ownership::creator); - debug("operation_n_mode_product: matrix=0x%x\n", matrix); - - operation_n_mode_product(matrix, vector, tensor); - - return matrix; -} From c499f44d0f1c840e039ffde44c46aa9a1392b2cb Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Fri, 11 Nov 2011 18:26:56 -0700 Subject: [PATCH 38/57] + Split lin-alg routines off in to a new set of files, to test other libraries agains BLAS and hand-tuned code --- src/Makefile | 20 ++++++++++++-------- src/algebra.cc | 15 +++++++++++++++ src/algebra.h | 13 +++++++++++++ src/operation_n_mode_product.cc | 10 +++------- 4 files changed, 43 insertions(+), 15 deletions(-) create mode 100644 src/algebra.cc create mode 100644 src/algebra.h diff --git a/src/Makefile b/src/Makefile index 9c27480..d47ab7f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -9,6 +9,7 @@ else endif INCLUDES=-I. 
STRICT=-pedantic -Wall -Wno-variadic-macros + EXTRA_DEBUG=-g ifndef DEBUG EXTRA_DEBUG += -DNODEBUG @@ -16,15 +17,18 @@ endif ifndef SIMULATE EXTRA_DEBUG += -DNOSIMULATE endif + +EXTRA_LDFLAGS=-Wall -thread -lblas -llapack $(EXTRA_DEBUG) EXTRA_CXXFLAGS=-c -DYA_BLAS -DYA_LAPACK -DYA_BLASMULT $(EXTRA_DEBUG) \ - $(STRICT) $(INCLUDES) $(CPPX11) -EXTRA_LDFLAGS=-Wall -thread -lblas -llapack -framework Accelerate \ - $(EXTRA_DEBUG) + $(STRICT) $(INCLUDES) $(CPPX11) +ifeq "$(OS)" "Darwin" + EXTRA_LDFLAGS += -framework Accelerate +endif HEADERS_CACHE=address.h cache.h hash.h HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h \ - memory.h operation.h random.h thread.h strings.h timer.h \ - tool.h utility.h compatible.h + algebra.h memory.h operation.h random.h thread.h strings.h \ + timer.h tool.h utility.h compatible.h HEADERS_GENERATE=generate.h HEADERS_MATRIX=matrix.h mmio.h HEADERS_TENSOR=tensor.h @@ -34,9 +38,9 @@ HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE) \ SOURCES_CACHE=address.cc cache.cc hash.cc SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc \ - information.cc latex.cc memory.cc mmio.cc operation.cc \ - operation_n_mode_product.cc operation_utility.cc random.cc \ - strings.cc thread.cc timer.cc tool_effectuate.cc \ + information.cc latex.cc algebra.cc memory.cc mmio.cc \ + operation.cc operation_n_mode_product.cc operation_utility.cc \ + random.cc strings.cc thread.cc timer.cc tool_effectuate.cc \ tool_generate.cc tool_timing.cc tool_utility.cc types.cc \ utility.cc SOURCES_GENERATE=generate_tensor_from_matrix.cc diff --git a/src/algebra.cc b/src/algebra.cc new file mode 100644 index 0000000..d3a98d5 --- /dev/null +++ b/src/algebra.cc @@ -0,0 +1,15 @@ + +#include "algebra.h" +#include "error.h" + +#ifdef __APPLE__ +#include +#else +#include +#endif + +double +array_inner_product(const int N, const double *X, const int incX, const double *Y, const int incY) +{ + return cblas_ddot(N, X, incX, Y, incY); +} diff --git 
a/src/algebra.h b/src/algebra.h new file mode 100644 index 0000000..4a8e750 --- /dev/null +++ b/src/algebra.h @@ -0,0 +1,13 @@ + +#ifndef _ARRAY_MATH_H_ +#define _ARRAY_MATH_H_ + +double array_inner_product(const int N, const double *X, const int incX, const double *Y, const int incY); + +#endif + +/* + Local Variables: + mode: C++ + End: +*/ diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index cdc009a..8fbab24 100644 --- a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -1,4 +1,5 @@ +#include "algebra.h" #include "cache.h" #include "compatible.h" #include "error.h" @@ -10,11 +11,6 @@ #include "vector.h" #include #include -#ifdef __APPLE__ -#include -#else -#include -#endif extern cache_t *cache; extern uint memory_stride; @@ -73,7 +69,7 @@ tube_product(thread_argument_t *argument) offset = t*n; i = t/n; j = t%n; - M[i][j] = cblas_ddot(n, P, 1, T+offset, 1); + M[i][j] = array_inner_product(n, P, 1, T+offset, 1); } return NULL; @@ -111,7 +107,7 @@ slice_product(thread_argument_t *argument) ioffset = i*n*n; for (j = 0; j < n; ++j) { joffset = ioffset+j*n; - M[i][j] = cblas_ddot(n, P, 1, T+joffset, 1); + M[i][j] = array_inner_product(n, P, 1, T+joffset, 1); } } From 914cb463f760207846484dcaa766ffddbe000ac5 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 06:22:19 -0700 Subject: [PATCH 39/57] + Split support for Darwin and Linux --- src/algebra.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/algebra.cc b/src/algebra.cc index d3a98d5..242db80 100644 --- a/src/algebra.cc +++ b/src/algebra.cc @@ -4,7 +4,9 @@ #ifdef __APPLE__ #include -#else +#endif + +#ifdef __linux__ #include #endif From b22ae63a60c5929b11f34fb344831862e248047b Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 06:24:24 -0700 Subject: [PATCH 40/57] + Added left/right operand association split --- src/main.cc | 1 + src/operation_n_mode_product.cc | 103 +++++++++++++++++++++++--------- 
src/tool_effectuate.cc | 20 +++++-- 3 files changed, 92 insertions(+), 32 deletions(-) diff --git a/src/main.cc b/src/main.cc index 922276c..5e7dd16 100644 --- a/src/main.cc +++ b/src/main.cc @@ -18,6 +18,7 @@ #include #include +association::type_t operand_association; cache_t *cache; uint cache_size; uint cache_line_size; diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index 8fbab24..3d259e3 100644 --- a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -12,6 +12,7 @@ #include #include +extern association::type_t operand_association; extern cache_t *cache; extern uint memory_stride; extern uint thread_count; @@ -29,6 +30,18 @@ extern thread::partition::type_t thread_partition; end for end for end for + + Computing ($Tp$): + Let $\T \in R^{n\times n\times n}$ be a tensor. + Let $\M \in R^{n\times n}$ be a matrix. + Let $p \in R^{n}$ be a vector. + for i = 1 to l do + for j = 1 to m do + for k = 1 to m do + M[i][j] += T[j][i][k] * p[k] + end for + end for + end for */ typedef struct { @@ -49,21 +62,25 @@ tube_next(product_thread_data_t *data) return k < (data->tensor->n*data->tensor->n) ? 
k : -1; } -thread_address_t -tube_product(thread_argument_t *argument) +void +tube_product_pT(product_thread_data_t *data, uint n, double **M, double *P, double *T) { - int t; - uint i, j, offset; - uint n; - double **M, *T, *P; - product_thread_data_t *data; - - data = (product_thread_data_t*) thread_data(argument); + int t; + uint i, j, offset; - n = data->tensor->n; - M = data->matrix->data; - P = data->vector->data; - T = data->tensor->values; + while (-1 != (t = tube_next(data))) { + offset = t*n; + i = t/n; + j = t%n; + M[i][j] = array_inner_product(n, P, 1, T+offset, 1); + } +} + +void +tube_product_Tp(product_thread_data_t *data, uint n, double **M, double *P, double *T) +{ + int t; + uint i, j, offset; while (-1 != (t = tube_next(data))) { offset = t*n; @@ -71,6 +88,20 @@ tube_product(thread_argument_t *argument) j = t%n; M[i][j] = array_inner_product(n, P, 1, T+offset, 1); } +} + +thread_address_t +tube_product(thread_argument_t *argument) +{ + product_thread_data_t *data; + + data = (product_thread_data_t*) thread_data(argument); + + if (association::left == operand_association) { + tube_product_pT(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); + } else { + tube_product_Tp(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); + } return NULL; } @@ -86,22 +117,26 @@ slice_next(product_thread_data_t *data) return k < data->tensor->n ? 
k : -1; } -thread_address_t -slice_product(thread_argument_t *argument) +void +slice_product_pT(product_thread_data_t *data, uint n, double **M, double *P, double *T) { - int i; - uint j; - uint ioffset, joffset; - uint n; - double **M, *T, *P; - product_thread_data_t *data; + int i; + uint j, ioffset, joffset; - data = (product_thread_data_t*) thread_data(argument); - - n = data->tensor->n; - M = data->matrix->data; - P = data->vector->data; - T = data->tensor->values; + while (-1 != (i = slice_next(data))) { + ioffset = i*n*n; + for (j = 0; j < n; ++j) { + joffset = ioffset+j*n; + M[i][j] = array_inner_product(n, P, 1, T+joffset, 1); + } + } +} + +void +slice_product_Tp(product_thread_data_t *data, uint n, double **M, double *P, double *T) +{ + int i; + uint j, ioffset, joffset; while (-1 != (i = slice_next(data))) { ioffset = i*n*n; @@ -110,6 +145,20 @@ slice_product(thread_argument_t *argument) M[i][j] = array_inner_product(n, P, 1, T+joffset, 1); } } +} + +thread_address_t +slice_product(thread_argument_t *argument) +{ + product_thread_data_t *data; + + data = (product_thread_data_t*) thread_data(argument); + + if (association::left == operand_association) { + slice_product_pT(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); + } else { + slice_product_Tp(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); + } return NULL; } diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index 9119197..6db2945 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -19,6 +19,7 @@ #include #include +extern association::type_t operand_association; extern cache_t *cache; extern uint cache_size; extern uint cache_line_size; @@ -182,17 +183,25 @@ effectuate_tool_main(int argc, char *argv[]) int c; /* set the program's defaults */ - memory_stride = DEFAULT_MEMORY_STRIDE; - optcode = DEFAULT_OPERATION; - thread_count = DEFAULT_THREAD_COUNT; - thread_partition = DEFAULT_THREAD_PARTITION; + 
operand_association = DEFAULT_ASSOCIATION; + memory_stride = DEFAULT_MEMORY_STRIDE; + optcode = DEFAULT_OPERATION; + thread_count = DEFAULT_THREAD_COUNT; + thread_partition = DEFAULT_THREAD_PARTITION; /* we will privide our own error messages */ opterr = 0; /* extract any command-line options the user provided */ - while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:p:r:st:TuvV:w"))) { + while (-1 != (c = getopt(argc, argv, ":a:hl:m:n:o:p:r:st:TuvV:w"))) { switch (c) { + case 'a': + if (isdigit(optarg[0])) { + operand_association = (association::type_t) atoi(optarg); + } else { + operand_association = string_to_association(optarg); + } + break; case 'h': effectuate_tool_usage(); break; @@ -284,6 +293,7 @@ effectuate_tool_main(int argc, char *argv[]) /* print program options, for debugging purposes */ print_tool_options(); + debug("effectuate_tool_main: operand_association='%s'\n", association_to_string(operand_association)); debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode)); debug("effectuate_tool_main: memory_stride=%d\n", memory_stride); debug("effectuate_tool_main: thread_count=%d\n", thread_count); From 08f113b2819f4e175d8f68755e0b95df1c2f1562 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 06:25:02 -0700 Subject: [PATCH 41/57] + Added Octave/Matlab output test --- src/matrix_write.cc | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/matrix_write.cc b/src/matrix_write.cc index d8cc7f2..f1b2fa1 100644 --- a/src/matrix_write.cc +++ b/src/matrix_write.cc @@ -16,6 +16,31 @@ matrix_initialize_type(MM_typecode *type) mm_set_real(type); } +#if 0 +/* Matlab/Octave format */ +void printmat(int N, int M, double *A, int LDA) +{ + int i, j; + double mtmp; + + printf("[ "); + for (i = 0; i < N; i++) { + printf("[ "); + for (j = 0; j < M; j++) { + mtmp = A[i + j * LDA]; + printf("%5.2e", mtmp); + if (j < M - 1) + printf(", "); + } + if (i < N - 1) + printf("]; "); + else + printf("] "); + } + 
printf("]"); +} +#endif + void matrix_fwrite_array(FILE *file, matrix_t const *matrix) { From afeb7df3b9c62262a1f1b10274f98a05917cf100 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 06:25:52 -0700 Subject: [PATCH 42/57] + Added default operand association --- src/tool.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tool.h b/src/tool.h index ba641a8..33eeb7c 100644 --- a/src/tool.h +++ b/src/tool.h @@ -20,6 +20,7 @@ namespace tool { #define OPTION_MESSAGE(x,a,b) (x ? a:b) #define DEFAULT_ON_OR_OFF(x) OPTION_MESSAGE(x, "on", "off") +#define DEFAULT_ASSOCIATION association::left #define DEFAULT_HUMAN_READABLE true #define DEFAULT_ITERATIONS 1 #define DEFAULT_MEMORY_STRIDE 32 From d8a49fe87d89d1a90c2210a9bbce8491425d1dbf Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 06:27:25 -0700 Subject: [PATCH 43/57] + Conversion between operand association string and enum --- src/operation.h | 10 ++++++++++ src/operation_utility.cc | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/src/operation.h b/src/operation.h index d8f6bce..8576ebb 100644 --- a/src/operation.h +++ b/src/operation.h @@ -13,8 +13,18 @@ namespace operation { } type_t; } +namespace association { + typedef enum { + unknown, + left, + right + } type_t; +} + +char const* association_to_string(association::type_t association); char const* operation_to_string(operation::type_t operation); char const* operation_to_description_string(operation::type_t operation); +association::type_t string_to_association(char const *name); operation::type_t string_to_operation(char const *name); void operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor); diff --git a/src/operation_utility.cc b/src/operation_utility.cc index e865afc..2c7dd4d 100644 --- a/src/operation_utility.cc +++ b/src/operation_utility.cc @@ -60,3 +60,29 @@ print_operations_with_descriptions(char const *format) message(format, map_operations_to_string[i], 
map_operations_to_description[i]); } } + +static char const *map_associations_to_string[] = { + "unknown", + "left", + "right" +}; + +char const* +association_to_string(association::type_t association) +{ + return map_associations_to_string[association]; +} + +association::type_t +string_to_association(char const *name) +{ + uint i; + + for (i = 0; i < COUNT_OF(map_associations_to_string); ++i) { + if (0 == strcmp(name, map_associations_to_string[i])) { + return (association::type_t) i; + } + } + + return association::unknown; +} From 06b41c89e8fcf5fd484b99a90581b358ec176bbe Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 10:58:30 -0700 Subject: [PATCH 44/57] + Added UI support for tensor storage orientation and storage strategy specification --- src/main.cc | 2 + src/thread.cc | 26 +++++++ src/thread.h | 2 + src/tool.h | 2 +- src/tool_convert.cc | 169 +++++++++++++++++++++++++++++++++++++++++ src/tool_effectuate.cc | 35 +++++++-- 6 files changed, 230 insertions(+), 6 deletions(-) create mode 100644 src/tool_convert.cc diff --git a/src/main.cc b/src/main.cc index 5e7dd16..9fc3efa 100644 --- a/src/main.cc +++ b/src/main.cc @@ -26,6 +26,8 @@ uint iterations; uint memory_stride; thread::partition::type_t thread_partition; uint seed; +orientation::type_t storage_orientation; +strategy::type_t storage_strategy; uint thread_count; char *tool_name; tool::type_t tool_type; diff --git a/src/thread.cc b/src/thread.cc index f6bcd5d..186e844 100644 --- a/src/thread.cc +++ b/src/thread.cc @@ -16,6 +16,12 @@ static char const *map_thread_partition_to_string[] = { "slice" }; +static char const *map_thread_partition_to_description[] = { + "unknown", + "tube per thread", + "slice per thread" +}; + char const* thread_partition_to_string(thread::partition::type_t partition) { @@ -36,6 +42,26 @@ string_to_thread_partition(char const *name) return thread::partition::unknown; } +void +print_thread_partitions(char const *format) +{ + uint i; + + for (i = 1; i < 
COUNT_OF(map_thread_partition_to_string); ++i) { + message(format, map_thread_partition_to_string[i]); + } +} + +void +print_thread_partitions_with_descriptions(char const *format) +{ + uint i; + + for (i = 1; i < COUNT_OF(map_thread_partition_to_string); ++i) { + message(format, map_thread_partition_to_string[i], map_thread_partition_to_description[i]); + } +} + /************************************************* * attempt to lock a mutex */ diff --git a/src/thread.h b/src/thread.h index 685cd9d..ace57b2 100644 --- a/src/thread.h +++ b/src/thread.h @@ -29,6 +29,8 @@ namespace thread { char const* thread_partition_to_string(thread::partition::type_t partition); thread::partition::type_t string_to_thread_partition(char const *name); +void print_thread_partitions(char const *format); +void print_thread_partitions_with_descriptions(char const *format); /* Linux defs: * _REENTRANT to get thread-safe libs diff --git a/src/tool.h b/src/tool.h index 33eeb7c..9946bbe 100644 --- a/src/tool.h +++ b/src/tool.h @@ -27,7 +27,7 @@ namespace tool { #define DEFAULT_OPERATION operation::n_mode_product #define DEFAULT_ORIENTATION orientation::row #define DEFAULT_SIMULATE false -#define DEFAULT_STRATEGY strategy::compressed +#define DEFAULT_STRATEGY strategy::array #define DEFAULT_TRACING false #define DEFAULT_THREAD_COUNT 1 #define DEFAULT_THREAD_PARTITION thread::partition::tube diff --git a/src/tool_convert.cc b/src/tool_convert.cc new file mode 100644 index 0000000..592fcd7 --- /dev/null +++ b/src/tool_convert.cc @@ -0,0 +1,169 @@ + +#include "cache.h" +#include "compatible.h" +#include "error.h" +#include "file.h" +#include "matrix.h" +#include "operation.h" +#include "tensor.h" +#include "tool.h" +#include "utility.h" +#include "vector.h" +#include +#include +#include +#include +#include +#include +#include + +extern cache_t *cache; +extern uint cache_size; +extern uint cache_line_size; +extern bool emit_latex; +extern uint iterations; +extern char *tool_name; +extern 
tool::type_t tool_type; +extern bool simulate; +extern bool verbose; +extern verbosity::type_t noisiness; +extern bool write_results; + +void +convert_tool_usage() +{ + print_tool_banner(); + message("\nUsage:\n"); + message("\t%s [options] [output]\n", tool_name); + message("\nOptions:\n"); + message("\t-h\tthis screen\n"); + message("\t-l\temit LaTeX code as output (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_EMIT_LATEX)); + message("\t-s\tstrategy (default: %s)\n", strategy_to_string(DEFAULT_STRATEGY)); + print_strategies("\t\t- %s\n"); + message("\t-o\torientation (default: %s)\n", orientation_to_string(DEFAULT_ORIENTATION)); + print_orientations("\t\t- %s\n"); + message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE)); + message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max); + message("\nExample:\n\n"); + message("\t$ ./tensor %s -s compressed -o column ieee-fig4.in tensor.out\n", tool_name); + message("\tReading ieee-fig4.in ... done [0.000305]\n"); + message("\tConverting from 'coordinate' to 'compressed-column' ... done [0.000010]\n"); + message("\tWriting tensor.out ... done [0.000031]\n"); + exit(1); +} + +tensor_t* +timed_tensor_convert(tensor_t *source, strategy::type_t strategy, orientation::type_t orientation) +{ + precision_timer_t t; + tensor_t *tensor; + + progress("Converting from '%s' to '%s-%s' ... 
", + strategy_to_string(source->strategy), + strategy_to_string(strategy), + orientation_to_string(orientation)); + timer_start(&t); + tensor = tensor_convert(source, strategy, orientation); + timer_end(&t); + print_elapsed_time(t); + + return tensor; +} + +void +convert_tool_main(int argc, char *argv[]) +{ + int c, offset; + char *name; + tensor_t *tensor, *result; + strategy::type_t strategy; + orientation::type_t orientation; + + /* just to be safe, set the tensors to null */ + tensor = result = NULL; + + /* set the program's defaults */ + orientation = DEFAULT_ORIENTATION; + strategy = DEFAULT_STRATEGY; + + /* we will privide our own error messages */ + opterr = 0; + + /* extract any command-line options the user provided */ + while (-1 != (c = getopt(argc, argv, ":hlo:s:vV:"))) { + switch (c) { + case 'h': + convert_tool_usage(); + break; + case 'l': + emit_latex = !emit_latex; + break; + case 'o': + if (isdigit(optarg[0])) { + orientation = (orientation::type_t) atoi(optarg); + } else { + orientation = string_to_orientation(optarg); + } + break; + case 's': + if (isdigit(optarg[0])) { + strategy = (strategy::type_t) atoi(optarg); + } else { + strategy = string_to_strategy(optarg); + } + break; + case 'v': + verbose = !verbose; + break; + case 'V': + noisiness = (verbosity::type_t) atoi(optarg); + if (0 == noisiness) { + noisiness = DEFAULT_VERBOSITY; + } + break; + case ':': + die("Option -%c requires an operand; that is, an integer or string value.\n", optopt); + break; + case '?': + die("Unknown option: `-%c'\n", optopt); + break; + default: + abort(); + break; + } + } + + if (noisiness > DEFAULT_VERBOSITY) { + verbose = true; + } + + /* count the number of remaining arguments */ + if (argc-optind < 1) { + convert_tool_usage(); + } + + /* print program options, for debugging purposes */ + print_tool_options(); + debug("convert_tool_main: orientation='%s'\n", orientation_to_string(orientation)); + debug("convert_tool_main: strategy='%s'\n", 
strategy_to_string(strategy)); + + /* parse the remaining command line options */ + offset = optind; + name = argv[offset++]; + tensor = timed_tensor_read(name); + debug("main: tensor=0x%x\n", tensor); + + if (strategy == tensor->strategy) { + /* we'll deal with differing orientation when it comes up */ + result = tensor; + tensor = NULL; + } else { + result = timed_tensor_convert(tensor, strategy, orientation); + } + debug("main: result=0x%x\n", result); + + timed_tensor_write(argc, argv, offset, result); + + tensor_free(result); + tensor_free(tensor); +} diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index 6db2945..ad0ce22 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -26,6 +26,8 @@ extern uint cache_line_size; extern bool human_readable; extern uint iterations; extern uint memory_stride; +extern orientation::type_t storage_orientation; +extern strategy::type_t storage_strategy; extern uint thread_count; extern thread::partition::type_t thread_partition; extern char *tool_name; @@ -53,20 +55,25 @@ effectuate_tool_usage() message("\t-n\tnumber of times to apply operation (default: %d)\n", DEFAULT_ITERATIONS); message("\t-o\toperation (default: %s)\n", operation_to_string(DEFAULT_OPERATION)); print_operations_with_descriptions("\t\t- %s : %s\n"); + message("\t-O\torientation (default: %s)\n", orientation_to_string(DEFAULT_ORIENTATION)); + print_orientations("\t\t- %s\n"); #if !defined (NOSIMULATE) message("\t-s\tsimulate cache (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_SIMULATE)); #endif - message("\t-t\tnumer of thread_count to use (default: %d)\n", DEFAULT_THREAD_COUNT); + message("\t-S\tstrategy (default: %s)\n", strategy_to_string(DEFAULT_STRATEGY)); + print_strategies("\t\t- %s\n"); + message("\t-p\tpartition scheme for work (default: %s)\n", thread_partition_to_string(DEFAULT_THREAD_PARTITION)); + print_thread_partitions_with_descriptions("\t\t- %s : %s\n"); + message("\t-t\tnumber of threads to use for operation (default: 
%d)\n", DEFAULT_THREAD_COUNT); message("\t-T\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING)); message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE)); message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max); message("\t-w\twrite results (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_WRITE_RESULTS)); message("\nExample:\n\n"); - message("\t$ ./tensor %s -o n-mode vector.in tensor.in matrix.out\n", tool_name); + message("\t$ ./tensor %s -o n-mode vector100.in dense100.in\n", tool_name); message("\tReading vector.in ... done [0.000305]\n"); message("\tReading tensor.in ... done [0.000235]\n"); - message("\tConverting from 'coordinate' to 'compressed-column' ... done [0.000010]\n"); - message("\tWriting matrix.out ... done [0.000031]\n"); + message("\tPerforming operation 'dense tensor \times vector product' ... done [3.736000]"); exit(1); } @@ -186,6 +193,8 @@ effectuate_tool_main(int argc, char *argv[]) operand_association = DEFAULT_ASSOCIATION; memory_stride = DEFAULT_MEMORY_STRIDE; optcode = DEFAULT_OPERATION; + storage_orientation = DEFAULT_ORIENTATION; + storage_strategy = DEFAULT_STRATEGY; thread_count = DEFAULT_THREAD_COUNT; thread_partition = DEFAULT_THREAD_PARTITION; @@ -193,7 +202,7 @@ effectuate_tool_main(int argc, char *argv[]) opterr = 0; /* extract any command-line options the user provided */ - while (-1 != (c = getopt(argc, argv, ":a:hl:m:n:o:p:r:st:TuvV:w"))) { + while (-1 != (c = getopt(argc, argv, ":a:hl:m:n:o:O:p:r:sS:t:TuvV:w"))) { switch (c) { case 'a': if (isdigit(optarg[0])) { @@ -230,6 +239,13 @@ effectuate_tool_main(int argc, char *argv[]) optcode = string_to_operation(optarg); } break; + case 'O': + if (isdigit(optarg[0])) { + storage_orientation = (orientation::type_t) atoi(optarg); + } else { + storage_orientation = string_to_orientation(optarg); + } + break; case 'p': if (isdigit(optarg[0])) { thread_partition = (thread::partition::type_t) atoi(optarg); 
@@ -246,6 +262,13 @@ effectuate_tool_main(int argc, char *argv[]) case 's': simulate = !simulate; break; + case 'S': + if (isdigit(optarg[0])) { + storage_strategy = (strategy::type_t) atoi(optarg); + } else { + storage_strategy = string_to_strategy(optarg); + } + break; case 't': thread_count = atoi(optarg); if (0 == thread_count) { @@ -296,6 +319,8 @@ effectuate_tool_main(int argc, char *argv[]) debug("effectuate_tool_main: operand_association='%s'\n", association_to_string(operand_association)); debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode)); debug("effectuate_tool_main: memory_stride=%d\n", memory_stride); + debug("effectuate_tool_main: storage_orientation='%s'\n", orientation_to_string(storage_orientation)); + debug("effectuate_tool_main: storage_strategy='%s'\n", strategy_to_string(storage_strategy)); debug("effectuate_tool_main: thread_count=%d\n", thread_count); debug("effectuate_tool_main: thread_partition='%s'\n", thread_partition_to_string(thread_partition)); From 757a983a8504ad74e7c261f8c03f46c40c659a4f Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 10:59:07 -0700 Subject: [PATCH 45/57] + We are working with DENSE tensors, so reflect this in out output messages --- src/operation_utility.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operation_utility.cc b/src/operation_utility.cc index 2c7dd4d..f95c3c0 100644 --- a/src/operation_utility.cc +++ b/src/operation_utility.cc @@ -12,7 +12,7 @@ static char const *map_operations_to_string[] = { static char const *map_operations_to_description[] = { "unknown", - "dense vector \\times sparse tensor product" + "dense tensor \\times vector product" }; char const* From 025d0dab7f929c0fe906a3ee616257136aaec45a Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 11:01:42 -0700 Subject: [PATCH 46/57] + All operations are now of the form tensor operation operand --- src/main.cc | 1 - src/operation.h | 10 ---------- src/tool.h | 1 - 
src/tool_effectuate.cc | 10 +--------- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/src/main.cc b/src/main.cc index 9fc3efa..ee257e2 100644 --- a/src/main.cc +++ b/src/main.cc @@ -18,7 +18,6 @@ #include #include -association::type_t operand_association; cache_t *cache; uint cache_size; uint cache_line_size; diff --git a/src/operation.h b/src/operation.h index 8576ebb..d8f6bce 100644 --- a/src/operation.h +++ b/src/operation.h @@ -13,18 +13,8 @@ namespace operation { } type_t; } -namespace association { - typedef enum { - unknown, - left, - right - } type_t; -} - -char const* association_to_string(association::type_t association); char const* operation_to_string(operation::type_t operation); char const* operation_to_description_string(operation::type_t operation); -association::type_t string_to_association(char const *name); operation::type_t string_to_operation(char const *name); void operation_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor); diff --git a/src/tool.h b/src/tool.h index 9946bbe..d913336 100644 --- a/src/tool.h +++ b/src/tool.h @@ -20,7 +20,6 @@ namespace tool { #define OPTION_MESSAGE(x,a,b) (x ? 
a:b) #define DEFAULT_ON_OR_OFF(x) OPTION_MESSAGE(x, "on", "off") -#define DEFAULT_ASSOCIATION association::left #define DEFAULT_HUMAN_READABLE true #define DEFAULT_ITERATIONS 1 #define DEFAULT_MEMORY_STRIDE 32 diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index ad0ce22..f01c129 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -190,7 +190,6 @@ effectuate_tool_main(int argc, char *argv[]) int c; /* set the program's defaults */ - operand_association = DEFAULT_ASSOCIATION; memory_stride = DEFAULT_MEMORY_STRIDE; optcode = DEFAULT_OPERATION; storage_orientation = DEFAULT_ORIENTATION; @@ -202,15 +201,8 @@ effectuate_tool_main(int argc, char *argv[]) opterr = 0; /* extract any command-line options the user provided */ - while (-1 != (c = getopt(argc, argv, ":a:hl:m:n:o:O:p:r:sS:t:TuvV:w"))) { + while (-1 != (c = getopt(argc, argv, ":hl:m:n:o:O:p:r:sS:t:TuvV:w"))) { switch (c) { - case 'a': - if (isdigit(optarg[0])) { - operand_association = (association::type_t) atoi(optarg); - } else { - operand_association = string_to_association(optarg); - } - break; case 'h': effectuate_tool_usage(); break; From 038d3de0e8809acb210c2a3b4bdbaae01f2f8de4 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 11:02:15 -0700 Subject: [PATCH 47/57] + All operations are now of the form tensor operation operand --- src/tool_effectuate.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index f01c129..e857674 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -19,7 +19,6 @@ #include #include -extern association::type_t operand_association; extern cache_t *cache; extern uint cache_size; extern uint cache_line_size; From bf75d179b16991955d72c1d3a1f5cf1f16efaa76 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 11:03:42 -0700 Subject: [PATCH 48/57] + All operations are now of the form tensor operation operand --- src/operation_utility.cc | 26 -------------------------- 
src/tool_effectuate.cc | 1 - 2 files changed, 27 deletions(-) diff --git a/src/operation_utility.cc b/src/operation_utility.cc index f95c3c0..e5a9b83 100644 --- a/src/operation_utility.cc +++ b/src/operation_utility.cc @@ -60,29 +60,3 @@ print_operations_with_descriptions(char const *format) message(format, map_operations_to_string[i], map_operations_to_description[i]); } } - -static char const *map_associations_to_string[] = { - "unknown", - "left", - "right" -}; - -char const* -association_to_string(association::type_t association) -{ - return map_associations_to_string[association]; -} - -association::type_t -string_to_association(char const *name) -{ - uint i; - - for (i = 0; i < COUNT_OF(map_associations_to_string); ++i) { - if (0 == strcmp(name, map_associations_to_string[i])) { - return (association::type_t) i; - } - } - - return association::unknown; -} diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index e857674..92f788d 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -307,7 +307,6 @@ effectuate_tool_main(int argc, char *argv[]) /* print program options, for debugging purposes */ print_tool_options(); - debug("effectuate_tool_main: operand_association='%s'\n", association_to_string(operand_association)); debug("effectuate_tool_main: operation='%s'\n", operation_to_string(optcode)); debug("effectuate_tool_main: memory_stride=%d\n", memory_stride); debug("effectuate_tool_main: storage_orientation='%s'\n", orientation_to_string(storage_orientation)); From b78c5307e89ee2139c397f604a6160ed0793f8cd Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 11:14:00 -0700 Subject: [PATCH 49/57] + Changed partition name tube to fiber (for correctness) --- src/operation_n_mode_product.cc | 67 +++++---------------------------- src/thread.cc | 4 +- src/thread.h | 2 +- src/tool.h | 2 +- 4 files changed, 13 insertions(+), 62 deletions(-) diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index 
3d259e3..a35b4f8 100644 --- a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -12,7 +12,6 @@ #include #include -extern association::type_t operand_association; extern cache_t *cache; extern uint memory_stride; extern uint thread_count; @@ -30,18 +29,6 @@ extern thread::partition::type_t thread_partition; end for end for end for - - Computing ($Tp$): - Let $\T \in R^{n\times n\times n}$ be a tensor. - Let $\M \in R^{n\times n}$ be a matrix. - Let $p \in R^{n}$ be a vector. - for i = 1 to l do - for j = 1 to m do - for k = 1 to m do - M[i][j] += T[j][i][k] * p[k] - end for - end for - end for */ typedef struct { @@ -52,7 +39,7 @@ typedef struct { } product_thread_data_t; int -tube_next(product_thread_data_t *data) +fiber_next(product_thread_data_t *data) { volatile uint k; @@ -63,26 +50,12 @@ tube_next(product_thread_data_t *data) } void -tube_product_pT(product_thread_data_t *data, uint n, double **M, double *P, double *T) +fiber_product_tube(product_thread_data_t *data, uint n, double **M, double *P, double *T) { int t; uint i, j, offset; - while (-1 != (t = tube_next(data))) { - offset = t*n; - i = t/n; - j = t%n; - M[i][j] = array_inner_product(n, P, 1, T+offset, 1); - } -} - -void -tube_product_Tp(product_thread_data_t *data, uint n, double **M, double *P, double *T) -{ - int t; - uint i, j, offset; - - while (-1 != (t = tube_next(data))) { + while (-1 != (t = fiber_next(data))) { offset = t*n; i = t/n; j = t%n; @@ -91,17 +64,13 @@ tube_product_Tp(product_thread_data_t *data, uint n, double **M, double *P, doub } thread_address_t -tube_product(thread_argument_t *argument) +fiber_product(thread_argument_t *argument) { product_thread_data_t *data; data = (product_thread_data_t*) thread_data(argument); - if (association::left == operand_association) { - tube_product_pT(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); - } else { - tube_product_Tp(data, data->tensor->n, data->matrix->data, 
data->vector->data, data->tensor->values); - } + fiber_product_tube(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); return NULL; } @@ -118,7 +87,7 @@ slice_next(product_thread_data_t *data) } void -slice_product_pT(product_thread_data_t *data, uint n, double **M, double *P, double *T) +slice_product_horizontal(product_thread_data_t *data, uint n, double **M, double *P, double *T) { int i; uint j, ioffset, joffset; @@ -132,20 +101,6 @@ slice_product_pT(product_thread_data_t *data, uint n, double **M, double *P, dou } } -void -slice_product_Tp(product_thread_data_t *data, uint n, double **M, double *P, double *T) -{ - int i; - uint j, ioffset, joffset; - - while (-1 != (i = slice_next(data))) { - ioffset = i*n*n; - for (j = 0; j < n; ++j) { - joffset = ioffset+j*n; - M[i][j] = array_inner_product(n, P, 1, T+joffset, 1); - } - } -} thread_address_t slice_product(thread_argument_t *argument) @@ -154,11 +109,7 @@ slice_product(thread_argument_t *argument) data = (product_thread_data_t*) thread_data(argument); - if (association::left == operand_association) { - slice_product_pT(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); - } else { - slice_product_Tp(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); - } + slice_product_horizontal(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); return NULL; } @@ -188,8 +139,8 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t thread_function_t function; switch (thread_partition) { - case thread::partition::tube: - function = (thread_function_t) &tube_product; + case thread::partition::fiber: + function = (thread_function_t) &fiber_product; break; case thread::partition::slice: function = (thread_function_t) &slice_product; diff --git a/src/thread.cc b/src/thread.cc index 186e844..442db57 100644 --- a/src/thread.cc +++ b/src/thread.cc @@ -12,13 +12,13 @@ 
static char const *map_thread_partition_to_string[] = { "unknown", - "tube", + "fiber", "slice" }; static char const *map_thread_partition_to_description[] = { "unknown", - "tube per thread", + "fiber per thread", "slice per thread" }; diff --git a/src/thread.h b/src/thread.h index ace57b2..3db2bb1 100644 --- a/src/thread.h +++ b/src/thread.h @@ -20,7 +20,7 @@ namespace thread { namespace partition { typedef enum { unknown, - tube, + fiber, slice } type_t; } diff --git a/src/tool.h b/src/tool.h index d913336..46291ed 100644 --- a/src/tool.h +++ b/src/tool.h @@ -29,7 +29,7 @@ namespace tool { #define DEFAULT_STRATEGY strategy::array #define DEFAULT_TRACING false #define DEFAULT_THREAD_COUNT 1 -#define DEFAULT_THREAD_PARTITION thread::partition::tube +#define DEFAULT_THREAD_PARTITION thread::partition::fiber #define DEFAULT_VERBOSE false #define DEFAULT_VERBOSITY verbosity::low #define DEFAULT_WRITE_RESULTS false From 731271c6006883813a1ab52a8d1994e98525702c Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 18:14:47 -0700 Subject: [PATCH 50/57] + Added a UI to handle data partitioning --- src/Makefile | 14 ++++---- src/data.cc | 57 +++++++++++++++++++++++++++++++++ src/data.h | 21 ++++++++++++ src/main.cc | 39 +++++++++++----------- src/operation.cc | 2 +- src/operation_n_mode_product.cc | 37 ++++++++++++--------- src/thread.cc | 52 ------------------------------ src/thread.h | 24 -------------- src/tool.h | 4 +-- src/tool_effectuate.cc | 55 +++++++++++++++---------------- 10 files changed, 157 insertions(+), 148 deletions(-) create mode 100644 src/data.cc create mode 100644 src/data.h diff --git a/src/Makefile b/src/Makefile index d47ab7f..50680aa 100644 --- a/src/Makefile +++ b/src/Makefile @@ -26,9 +26,9 @@ ifeq "$(OS)" "Darwin" endif HEADERS_CACHE=address.h cache.h hash.h -HEADERS_GENERAL=arithmetic.h error.h file.h information.h latex.h \ - algebra.h memory.h operation.h random.h thread.h strings.h \ - timer.h tool.h utility.h compatible.h 
+HEADERS_GENERAL=arithmetic.h data.h error.h file.h information.h \ + latex.h algebra.h memory.h operation.h queue.h random.h \ + thread.h strings.h timer.h tool.h utility.h compatible.h HEADERS_GENERATE=generate.h HEADERS_MATRIX=matrix.h mmio.h HEADERS_TENSOR=tensor.h @@ -37,12 +37,12 @@ HEADERS=$(HEADERS_CACHE) $(HEADERS_GENERAL) $(HEADERS_GENERATE) \ $(HEADERS_MATRIX) $(HEADERS_TENSOR) $(HEADERS_VECTOR) SOURCES_CACHE=address.cc cache.cc hash.cc -SOURCES_GENERAL=arithmetic.cc compatible.cc error.cc file.cc \ +SOURCES_GENERAL=arithmetic.cc data.cc compatible.cc error.cc file.cc \ information.cc latex.cc algebra.cc memory.cc mmio.cc \ operation.cc operation_n_mode_product.cc operation_utility.cc \ - random.cc strings.cc thread.cc timer.cc tool_effectuate.cc \ - tool_generate.cc tool_timing.cc tool_utility.cc types.cc \ - utility.cc + queue.cc random.cc strings.cc thread.cc timer.cc \ + tool_effectuate.cc tool_generate.cc tool_timing.cc \ + tool_utility.cc types.cc utility.cc SOURCES_GENERATE=generate_tensor_from_matrix.cc SOURCES_MATRIX=matrix_arithmetic.cc matrix_clear.cc \ matrix_compatible.cc matrix_copy.cc matrix_free.cc \ diff --git a/src/data.cc b/src/data.cc new file mode 100644 index 0000000..86086d3 --- /dev/null +++ b/src/data.cc @@ -0,0 +1,57 @@ + +#include "data.h" +#include "error.h" +#include "utility.h" +#include + +static char const *map_data_partition_to_string[] = { + "unknown", + "fiber", + "slice" +}; + +static char const *map_data_partition_to_description[] = { + "unknown", + "fiber per data", + "slice per data" +}; + +char const* +data_partition_to_string(data::partition::type_t partition) +{ + return map_data_partition_to_string[partition]; +} + +data::partition::type_t +string_to_data_partition(char const *name) +{ + uint i; + + for (i = 0; i < COUNT_OF(map_data_partition_to_string); ++i) { + if (0 == strcmp(name, map_data_partition_to_string[i])) { + return (data::partition::type_t) i; + } + } + + return data::partition::unknown; +} + +void 
+print_data_partitions(char const *format) +{ + uint i; + + for (i = 1; i < COUNT_OF(map_data_partition_to_string); ++i) { + message(format, map_data_partition_to_string[i]); + } +} + +void +print_data_partitions_with_descriptions(char const *format) +{ + uint i; + + for (i = 1; i < COUNT_OF(map_data_partition_to_string); ++i) { + message(format, map_data_partition_to_string[i], map_data_partition_to_description[i]); + } +} diff --git a/src/data.h b/src/data.h new file mode 100644 index 0000000..f8ab685 --- /dev/null +++ b/src/data.h @@ -0,0 +1,21 @@ + +#ifndef _DATA_H_ +#define _DATA_H_ + +namespace data { + namespace partition { + typedef enum { + unknown, + fiber, + slice + } type_t; + } +} + +char const* data_partition_to_string(data::partition::type_t partition); +data::partition::type_t string_to_data_partition(char const *name); +void print_data_partitions(char const *format); +void print_data_partitions_with_descriptions(char const *format); + +#endif /* _DATA_H_ */ + diff --git a/src/main.cc b/src/main.cc index ee257e2..88bc1e7 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1,6 +1,7 @@ #include "cache.h" #include "compatible.h" +#include "data.h" #include "error.h" #include "file.h" #include "matrix.h" @@ -18,25 +19,25 @@ #include #include -cache_t *cache; -uint cache_size; -uint cache_line_size; -uint iterations; -uint memory_stride; -thread::partition::type_t thread_partition; -uint seed; -orientation::type_t storage_orientation; -strategy::type_t storage_strategy; -uint thread_count; -char *tool_name; -tool::type_t tool_type; -bool tracing; -bool simulate; -bool human_readable; -bool verbose; -verbosity::type_t noisiness; -bool write_results; -bool emit_latex; +cache_t *cache; +uint cache_size; +uint cache_line_size; +uint iterations; +uint memory_stride; +data::partition::type_t data_partition; +uint seed; +orientation::type_t storage_orientation; +strategy::type_t storage_strategy; +uint thread_count; +char *tool_name; +tool::type_t tool_type; +bool 
tracing; +bool simulate; +bool human_readable; +bool verbose; +verbosity::type_t noisiness; +bool write_results; +bool emit_latex; void usage() diff --git a/src/operation.cc b/src/operation.cc index c6a8fd5..2cd16d7 100644 --- a/src/operation.cc +++ b/src/operation.cc @@ -25,7 +25,7 @@ threaded_n_mode_product(matrix_t *matrix, vector_t const *vector, tensor_t const threaded_n_mode_product_array(matrix, vector, tensor); break; default: - die("Tensor product for '%s' strategy (using thread_count) is not currently supported.\n", + die("threaded_n_mode_product: tensor product for '%s' strategy (using threads) is not currently supported.\n", strategy_to_string(tensor->strategy)); break; } diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index a35b4f8..9d83fc4 100644 --- a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -1,5 +1,6 @@ #include "algebra.h" +#include "data.h" #include "cache.h" #include "compatible.h" #include "error.h" @@ -12,10 +13,12 @@ #include #include -extern cache_t *cache; -extern uint memory_stride; -extern uint thread_count; -extern thread::partition::type_t thread_partition; +extern cache_t *cache; +extern uint memory_stride; +extern orientation::type_t storage_orientation; +extern strategy::type_t storage_strategy; +extern uint thread_count; +extern data::partition::type_t data_partition; /* Computing ($pT$): @@ -38,6 +41,8 @@ typedef struct { tensor_t const *tensor; } product_thread_data_t; +typedef void (*n_mode_product_t)(product_thread_data_t *data, uint n, double **M, double *P, double *T); + int fiber_next(product_thread_data_t *data) { @@ -50,7 +55,7 @@ fiber_next(product_thread_data_t *data) } void -fiber_product_tube(product_thread_data_t *data, uint n, double **M, double *P, double *T) +fiber_product_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T) { int t; uint i, j, offset; @@ -70,7 +75,7 @@ fiber_product(thread_argument_t *argument) data = 
(product_thread_data_t*) thread_data(argument); - fiber_product_tube(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); + fiber_product_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); return NULL; } @@ -87,21 +92,21 @@ slice_next(product_thread_data_t *data) } void -slice_product_horizontal(product_thread_data_t *data, uint n, double **M, double *P, double *T) +slice_product_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T) { int i; uint j, ioffset, joffset; while (-1 != (i = slice_next(data))) { ioffset = i*n*n; + joffset = ioffset; for (j = 0; j < n; ++j) { - joffset = ioffset+j*n; - M[i][j] = array_inner_product(n, P, 1, T+joffset, 1); + M[i][j] = array_inner_product(n, P, 1, T+joffset, 1); + joffset += n; } } } - thread_address_t slice_product(thread_argument_t *argument) { @@ -109,7 +114,7 @@ slice_product(thread_argument_t *argument) data = (product_thread_data_t*) thread_data(argument); - slice_product_horizontal(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); + slice_product_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); return NULL; } @@ -138,16 +143,16 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t { thread_function_t function; - switch (thread_partition) { - case thread::partition::fiber: + switch (data_partition) { + case data::partition::fiber: function = (thread_function_t) &fiber_product; break; - case thread::partition::slice: + case data::partition::slice: function = (thread_function_t) &slice_product; break; default: - die("serial_n_mode_product: tensor product for '%s' strategy is not currently supported.\n", - strategy_to_string(tensor->strategy)); + die("threaded_n_mode_product_array: tensor product for '%s' partition is not currently supported.\n", + data_partition_to_string(data_partition)); 
break; } diff --git a/src/thread.cc b/src/thread.cc index 442db57..d3cf79f 100644 --- a/src/thread.cc +++ b/src/thread.cc @@ -10,58 +10,6 @@ #include #include /* for EBUSY */ -static char const *map_thread_partition_to_string[] = { - "unknown", - "fiber", - "slice" -}; - -static char const *map_thread_partition_to_description[] = { - "unknown", - "fiber per thread", - "slice per thread" -}; - -char const* -thread_partition_to_string(thread::partition::type_t partition) -{ - return map_thread_partition_to_string[partition]; -} - -thread::partition::type_t -string_to_thread_partition(char const *name) -{ - uint i; - - for (i = 0; i < COUNT_OF(map_thread_partition_to_string); ++i) { - if (0 == strcmp(name, map_thread_partition_to_string[i])) { - return (thread::partition::type_t) i; - } - } - - return thread::partition::unknown; -} - -void -print_thread_partitions(char const *format) -{ - uint i; - - for (i = 1; i < COUNT_OF(map_thread_partition_to_string); ++i) { - message(format, map_thread_partition_to_string[i]); - } -} - -void -print_thread_partitions_with_descriptions(char const *format) -{ - uint i; - - for (i = 1; i < COUNT_OF(map_thread_partition_to_string); ++i) { - message(format, map_thread_partition_to_string[i], map_thread_partition_to_description[i]); - } -} - /************************************************* * attempt to lock a mutex */ diff --git a/src/thread.h b/src/thread.h index 3db2bb1..e565ccb 100644 --- a/src/thread.h +++ b/src/thread.h @@ -8,30 +8,6 @@ #ifndef _THREAD_H_ #define _THREAD_H_ -namespace thread { - - namespace model { - typedef enum { - unknown, - traditional - } type_t; - } - - namespace partition { - typedef enum { - unknown, - fiber, - slice - } type_t; - } - -} - -char const* thread_partition_to_string(thread::partition::type_t partition); -thread::partition::type_t string_to_thread_partition(char const *name); -void print_thread_partitions(char const *format); -void print_thread_partitions_with_descriptions(char const 
*format); - /* Linux defs: * _REENTRANT to get thread-safe libs * _POSIX_SOURCE to get POSIX semantics diff --git a/src/tool.h b/src/tool.h index 46291ed..b04db18 100644 --- a/src/tool.h +++ b/src/tool.h @@ -24,12 +24,12 @@ namespace tool { #define DEFAULT_ITERATIONS 1 #define DEFAULT_MEMORY_STRIDE 32 #define DEFAULT_OPERATION operation::n_mode_product -#define DEFAULT_ORIENTATION orientation::row +#define DEFAULT_ORIENTATION orientation::tube #define DEFAULT_SIMULATE false #define DEFAULT_STRATEGY strategy::array #define DEFAULT_TRACING false #define DEFAULT_THREAD_COUNT 1 -#define DEFAULT_THREAD_PARTITION thread::partition::fiber +#define DEFAULT_THREAD_PARTITION data::partition::fiber #define DEFAULT_VERBOSE false #define DEFAULT_VERBOSITY verbosity::low #define DEFAULT_WRITE_RESULTS false diff --git a/src/tool_effectuate.cc b/src/tool_effectuate.cc index 92f788d..fdb2b1b 100644 --- a/src/tool_effectuate.cc +++ b/src/tool_effectuate.cc @@ -1,6 +1,7 @@ #include "cache.h" #include "compatible.h" +#include "data.h" #include "error.h" #include "file.h" #include "matrix.h" @@ -19,23 +20,23 @@ #include #include -extern cache_t *cache; -extern uint cache_size; -extern uint cache_line_size; -extern bool human_readable; -extern uint iterations; -extern uint memory_stride; -extern orientation::type_t storage_orientation; -extern strategy::type_t storage_strategy; -extern uint thread_count; -extern thread::partition::type_t thread_partition; -extern char *tool_name; -extern tool::type_t tool_type; -extern bool simulate; -extern bool tracing; -extern bool verbose; -extern verbosity::type_t noisiness; -extern bool write_results; +extern cache_t *cache; +extern uint cache_size; +extern uint cache_line_size; +extern bool human_readable; +extern uint iterations; +extern uint memory_stride; +extern orientation::type_t storage_orientation; +extern strategy::type_t storage_strategy; +extern uint thread_count; +extern data::partition::type_t data_partition; +extern char *tool_name; 
+extern tool::type_t tool_type; +extern bool simulate; +extern bool tracing; +extern bool verbose; +extern verbosity::type_t noisiness; +extern bool write_results; static operation::type_t optcode; @@ -54,25 +55,25 @@ effectuate_tool_usage() message("\t-n\tnumber of times to apply operation (default: %d)\n", DEFAULT_ITERATIONS); message("\t-o\toperation (default: %s)\n", operation_to_string(DEFAULT_OPERATION)); print_operations_with_descriptions("\t\t- %s : %s\n"); - message("\t-O\torientation (default: %s)\n", orientation_to_string(DEFAULT_ORIENTATION)); + message("\t-O\tin memory storage orientation (default: %s)\n", orientation_to_string(DEFAULT_ORIENTATION)); print_orientations("\t\t- %s\n"); #if !defined (NOSIMULATE) message("\t-s\tsimulate cache (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_SIMULATE)); #endif - message("\t-S\tstrategy (default: %s)\n", strategy_to_string(DEFAULT_STRATEGY)); + message("\t-S\tin memory storage strategy (default: %s)\n", strategy_to_string(DEFAULT_STRATEGY)); print_strategies("\t\t- %s\n"); - message("\t-p\tpartition scheme for work (default: %s)\n", thread_partition_to_string(DEFAULT_THREAD_PARTITION)); - print_thread_partitions_with_descriptions("\t\t- %s : %s\n"); + message("\t-p\tpartition scheme for work (default: %s)\n", data_partition_to_string(DEFAULT_THREAD_PARTITION)); + print_data_partitions_with_descriptions("\t\t- %s : %s\n"); message("\t-t\tnumber of threads to use for operation (default: %d)\n", DEFAULT_THREAD_COUNT); message("\t-T\ttoggle tracing (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_TRACING)); message("\t-v\ttoggle verbosity (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_VERBOSE)); message("\t-V\tdebug verbosity level (default: %d/%d)\n", DEFAULT_VERBOSITY, verbosity::max); message("\t-w\twrite results (default: %s)\n", DEFAULT_ON_OR_OFF(DEFAULT_WRITE_RESULTS)); message("\nExample:\n\n"); - message("\t$ ./tensor %s -o n-mode vector100.in dense100.in\n", tool_name); + message("\t$ ./tensor %s -o n-mode vector.in 
tensor.in\n", tool_name); message("\tReading vector.in ... done [0.000305]\n"); message("\tReading tensor.in ... done [0.000235]\n"); - message("\tPerforming operation 'dense tensor \times vector product' ... done [3.736000]"); + message("\tPerforming operation 'dense tensor \\times vector product' ... done [3.736000]\n"); exit(1); } @@ -194,7 +195,7 @@ effectuate_tool_main(int argc, char *argv[]) storage_orientation = DEFAULT_ORIENTATION; storage_strategy = DEFAULT_STRATEGY; thread_count = DEFAULT_THREAD_COUNT; - thread_partition = DEFAULT_THREAD_PARTITION; + data_partition = DEFAULT_THREAD_PARTITION; /* we will privide our own error messages */ opterr = 0; @@ -239,9 +240,9 @@ effectuate_tool_main(int argc, char *argv[]) break; case 'p': if (isdigit(optarg[0])) { - thread_partition = (thread::partition::type_t) atoi(optarg); + data_partition = (data::partition::type_t) atoi(optarg); } else { - thread_partition = string_to_thread_partition(optarg); + data_partition = string_to_data_partition(optarg); } break; case 'r': @@ -312,7 +313,7 @@ effectuate_tool_main(int argc, char *argv[]) debug("effectuate_tool_main: storage_orientation='%s'\n", orientation_to_string(storage_orientation)); debug("effectuate_tool_main: storage_strategy='%s'\n", strategy_to_string(storage_strategy)); debug("effectuate_tool_main: thread_count=%d\n", thread_count); - debug("effectuate_tool_main: thread_partition='%s'\n", thread_partition_to_string(thread_partition)); + debug("effectuate_tool_main: data_partition='%s'\n", data_partition_to_string(data_partition)); /* if we are just running a simulation, then we only do one iteration; otherwise, it would be really slow */ From fb5de7753d82bfc5017a16cddd7319d368a52550 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Mon, 14 Nov 2011 18:15:20 -0700 Subject: [PATCH 51/57] + Basic queue support --- src/queue.cc | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/queue.h | 27 ++++++++++++++++++++++ 2 files changed, 90 insertions(+) create 
mode 100644 src/queue.cc create mode 100644 src/queue.h diff --git a/src/queue.cc b/src/queue.cc new file mode 100644 index 0000000..6bfcf28 --- /dev/null +++ b/src/queue.cc @@ -0,0 +1,63 @@ + +#include "queue.h" +#include "error.h" +#include "memory.h" +#include "utility.h" +#include +#include +#include + +queue_t* +queue_malloc() +{ + queue_t *queue; + + superfluous("queue_malloc(max_size=%d)\n", max_size); + + queue = MALLOC(queue_t); + queue->first = 0; + queue->last = MAX_QUEUE_SIZE-1; + queuecount = 0; + + thread_mutex_init(&queue->lock); + + return queue; +} + +void +queue_free(queue_t *queue) +{ + superfluous("queue_free(queue=0x%x)\n", queue); + + thread_mutex_destroy(&queue->lock); + safe_free(queue); +} +void +queue_push(queue_t *queue, queue_node_t *node, uint x) +{ + debug("queue_update(queue=0x%x, node=0x%x, data=0x%x)\n", queue, node, data); + + thread_mutex_lock(&queue->lock); + + queue->last = (queue->last+1) % MAX_QUEUE_SIZE; + queue->data[queue->last] = x; + queue->count++; + + thread_mutex_unlock(&queue->lock); +} + +void +queue_pop(queue_t *queue) +{ + int current, x; + + thread_mutex_lock(&queue->lock); + + x = queue->data[queue->first]; + queue->first = (queue->first+1) % QUEUE_SIZE; + queue->count--; + + return x; +} + + diff --git a/src/queue.h b/src/queue.h new file mode 100644 index 0000000..0838754 --- /dev/null +++ b/src/queue.h @@ -0,0 +1,27 @@ + +#ifndef _QUEUE_H_ +#define _QUEUE_H_ + +#include "thread.h" +#include "types.h" + +#define MAX_QUEUE_SIZE 100 + +typedef struct queue_tag { + uint data[MAX_QUEUE_SIZE]; + uint first, last; + pthread_mutex_t lock; +} queue_t; + +queue_t* queue_malloc(); +void queue_free(queue_t *queue); +void queue_push(queue_t *queue, uint x); +int queue_pop(queue_t *queue); + +#endif /* _HASH_H_ */ + +/* + Local Variables: + mode: C++ + End: +*/ From 02d6d1b4d753d326e065d11cab56642fe2f15c73 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Tue, 15 Nov 2011 09:34:36 -0700 Subject: [PATCH 52/57] + Added a 
lock-free queue --- src/queue.cc | 98 +++++++++++++++++++++++++++++++++------------------- src/queue.h | 16 ++++----- 2 files changed, 71 insertions(+), 43 deletions(-) diff --git a/src/queue.cc b/src/queue.cc index 6bfcf28..71eb443 100644 --- a/src/queue.cc +++ b/src/queue.cc @@ -8,56 +8,84 @@ #include queue_t* -queue_malloc() +queue_malloc(void) { - queue_t *queue; + queue_t *q; - superfluous("queue_malloc(max_size=%d)\n", max_size); + q = MALLOC(queue_t); + q->head = q->tail = MALLOC(node_t); - queue = MALLOC(queue_t); - queue->first = 0; - queue->last = MAX_QUEUE_SIZE-1; - queuecount = 0; - - thread_mutex_init(&queue->lock); - - return queue; + return q; } void -queue_free(queue_t *queue) +queue_push(queue_t *q, void *data) { - superfluous("queue_free(queue=0x%x)\n", queue); + node_t *node, *tail, *next; - thread_mutex_destroy(&queue->lock); - safe_free(queue); -} -void -queue_push(queue_t *queue, queue_node_t *node, uint x) -{ - debug("queue_update(queue=0x%x, node=0x%x, data=0x%x)\n", queue, node, data); + node = MALLOC(node_t); + node->data = data; + node->next = NULL; - thread_mutex_lock(&queue->lock); + while (true) { + + tail = q->tail; + next = tail->next; + + if (tail != q->tail) { + continue; + } + + if (NULL != next) { + __sync_bool_compare_and_swap(&q->tail, tail, next); + continue; + } + + if (__sync_bool_compare_and_swap(&tail->next, NULL, node)) { + break; + } + + } - queue->last = (queue->last+1) % MAX_QUEUE_SIZE; - queue->data[queue->last] = x; - queue->count++; - - thread_mutex_unlock(&queue->lock); + __sync_bool_compare_and_swap(&q->tail, tail, node); } -void -queue_pop(queue_t *queue) +void* +queue_pop(queue_t *q) { - int current, x; - - thread_mutex_lock(&queue->lock); + void *data; + node_t *head, *tail, *next; + + while (true) { + + head = q->head; + tail = q->tail; + next = head->next; + + if (head != q->head) { + continue; + } + + if (NULL == next) { + return NULL; // Empty + } + + if (head == tail) { + 
__sync_bool_compare_and_swap(&q->tail, tail, next); + continue; + } + + data = next->data; + + if (__sync_bool_compare_and_swap(&q->head, head, next)) { + break; + } + + } - x = queue->data[queue->first]; - queue->first = (queue->first+1) % QUEUE_SIZE; - queue->count--; + safe_free(head); - return x; + return data; } diff --git a/src/queue.h b/src/queue.h index 0838754..46e28e0 100644 --- a/src/queue.h +++ b/src/queue.h @@ -2,21 +2,21 @@ #ifndef _QUEUE_H_ #define _QUEUE_H_ -#include "thread.h" #include "types.h" -#define MAX_QUEUE_SIZE 100 +typedef struct _node_t { + void *data; + _node_t *next; +} node_t; -typedef struct queue_tag { - uint data[MAX_QUEUE_SIZE]; - uint first, last; - pthread_mutex_t lock; +typedef struct _queue_t { + node_t *head, *tail; } queue_t; queue_t* queue_malloc(); void queue_free(queue_t *queue); -void queue_push(queue_t *queue, uint x); -int queue_pop(queue_t *queue); +void queue_push(queue_t *queue, void *data); +void* queue_pop(queue_t *queue); #endif /* _HASH_H_ */ From 98f2317fce07655fc964c0cb53c7d4369c51f15a Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Tue, 15 Nov 2011 09:37:50 -0700 Subject: [PATCH 53/57] + Added Darwin and Linux code to detect the number of CPUs/cores --- src/thread.cc | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/thread.cc b/src/thread.cc index d3cf79f..4b8edd4 100644 --- a/src/thread.cc +++ b/src/thread.cc @@ -10,6 +10,40 @@ #include #include /* for EBUSY */ +#ifdef __APPLE__ +#include +#include +#endif + +#ifdef __linux__ +#include +#endif + +/************************************************* + * get the number of CPUs on this machine + */ +int +thread_get_cpu_count() +{ +#ifdef __APPLE__ + int i; + size_t s; + + i = 0; + s = sizeof(i); + + if (sysctlbyname("hw.ncpu", &i, &s, NULL, 0)) { + return 1; + } + + return i; +#endif + +#ifdef __linux__ + return get_nprocs(); +#endif +} + /************************************************* * attempt to lock a mutex */ From 
a41bbec7932422b063a9eca0ae1b2499034c11a1 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Tue, 15 Nov 2011 09:38:39 -0700 Subject: [PATCH 54/57] + Added a new data partitioning scheme --- src/data.cc | 6 ++++-- src/operation_n_mode_product.cc | 35 ++++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/data.cc b/src/data.cc index 86086d3..8ce13cc 100644 --- a/src/data.cc +++ b/src/data.cc @@ -7,13 +7,15 @@ static char const *map_data_partition_to_string[] = { "unknown", "fiber", - "slice" + "slice", + "fiber-decomposition" }; static char const *map_data_partition_to_description[] = { "unknown", "fiber per data", - "slice per data" + "slice per data", + "fibers decomposed by binary splitting" }; char const* diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index 9d83fc4..efbfb6f 100644 --- a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -6,6 +6,7 @@ #include "error.h" #include "matrix.h" #include "operation.h" +#include "queue.h" #include "thread.h" #include "tensor.h" #include "utility.h" @@ -55,7 +56,7 @@ fiber_next(product_thread_data_t *data) } void -fiber_product_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T) +fiber_consumer_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T) { int t; uint i, j, offset; @@ -69,13 +70,13 @@ fiber_product_implementation(product_thread_data_t *data, uint n, double **M, do } thread_address_t -fiber_product(thread_argument_t *argument) +fiber_consumer(thread_argument_t *argument) { product_thread_data_t *data; data = (product_thread_data_t*) thread_data(argument); - fiber_product_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); + fiber_consumer_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); return NULL; } @@ -92,7 +93,7 @@ slice_next(product_thread_data_t 
*data) } void -slice_product_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T) +slice_consumer_implementation(product_thread_data_t *data, uint n, double **M, double *P, double *T) { int i; uint j, ioffset, joffset; @@ -108,19 +109,19 @@ slice_product_implementation(product_thread_data_t *data, uint n, double **M, do } thread_address_t -slice_product(thread_argument_t *argument) +slice_consumer(thread_argument_t *argument) { product_thread_data_t *data; data = (product_thread_data_t*) thread_data(argument); - slice_product_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); + slice_consumer_implementation(data, data->tensor->n, data->matrix->data, data->vector->data, data->tensor->values); return NULL; } void -threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor, thread_function_t function) +threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor, thread_function_t producer, thread_function_t consumer) { product_thread_data_t data; @@ -135,28 +136,38 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t data.vector = vector; data.tensor = tensor; - thread_afork(thread_count, function, &data, NULL); + if (NULL != producer) { + thread_create_detached(producer, &data); + } + + thread_afork(thread_count, consumer, &data, NULL); } void threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor) { - thread_function_t function; + thread_function_t consumer, producer; + + producer = NULL; + consumer = NULL; switch (data_partition) { case data::partition::fiber: - function = (thread_function_t) &fiber_product; + consumer = (thread_function_t) &fiber_consumer; break; case data::partition::slice: - function = (thread_function_t) &slice_product; + consumer = (thread_function_t) &slice_consumer; break; + case data::partition::fiber_decomposition: + 
consumer = (thread_function_t) &subfiber_consumer; + producer = (thread_function_t) &subfiber_producer; default: die("threaded_n_mode_product_array: tensor product for '%s' partition is not currently supported.\n", data_partition_to_string(data_partition)); break; } - threaded_n_mode_product_array(matrix, vector, tensor, function); + threaded_n_mode_product_array(matrix, vector, tensor, producer, consumer); } void From 96a2f9cfdfab7c9a359391ab1f3d92fbdcfb64df Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Tue, 15 Nov 2011 11:32:08 -0700 Subject: [PATCH 55/57] + Added a 'semi-block recursive' data partitioning scheme --- src/compatible.cc | 2 +- src/data.cc | 8 +++--- src/data.h | 3 ++- src/operation_n_mode_product.cc | 45 ++++++++++++++++++++++++++------- 4 files changed, 43 insertions(+), 15 deletions(-) diff --git a/src/compatible.cc b/src/compatible.cc index a46c7b5..773acb1 100644 --- a/src/compatible.cc +++ b/src/compatible.cc @@ -28,7 +28,7 @@ compatible(vector_t const *lhs, tensor_t const *rhs) strategy_to_string(rhs->strategy)); } - compatible = (lhs->n == rhs->l); + compatible = (lhs->n == rhs->n); if (!compatible) { print_information(lhs); diff --git a/src/data.cc b/src/data.cc index 8ce13cc..492fa26 100644 --- a/src/data.cc +++ b/src/data.cc @@ -8,14 +8,14 @@ static char const *map_data_partition_to_string[] = { "unknown", "fiber", "slice", - "fiber-decomposition" + "block" }; static char const *map_data_partition_to_description[] = { "unknown", - "fiber per data", - "slice per data", - "fibers decomposed by binary splitting" + "fiber per thread", + "slice per thread", + "block per thread" }; char const* diff --git a/src/data.h b/src/data.h index f8ab685..db7d1a5 100644 --- a/src/data.h +++ b/src/data.h @@ -7,7 +7,8 @@ namespace data { typedef enum { unknown, fiber, - slice + slice, + block } type_t; } } diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index efbfb6f..b3b5693 100644 --- a/src/operation_n_mode_product.cc +++ 
b/src/operation_n_mode_product.cc @@ -81,6 +81,39 @@ fiber_consumer(thread_argument_t *argument) return NULL; } +void +block_consumer_implementation(product_thread_data_t *data, uint n, uint start, uint end, double **M, double *P, double *T) +{ + uint i, j, t, offset; + + for (t = start; t < end; ++t) { + offset = t*n; + i = t/n; + j = t%n; + M[i][j] = array_inner_product(n, P, 1, T+offset, 1); + } +} + +thread_address_t +block_consumer(thread_argument_t *argument) +{ + int id; + uint n, block_size; + uint start, end; + product_thread_data_t *data; + + data = (product_thread_data_t*) thread_data(argument); + n = data->tensor->n; + block_size = (n*n)/thread_count; + id = thread_myid(argument); + start = block_size*id; + end = start+block_size; + + block_consumer_implementation(data, n, start, end, data->matrix->data, data->vector->data, data->tensor->values); + + return NULL; +} + int slice_next(product_thread_data_t *data) { @@ -125,12 +158,6 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t { product_thread_data_t data; - memory_stride = memory_stride > tensor->n ? tensor->n : memory_stride; - thread_count = thread_count > tensor->n ? 
tensor->n : thread_count; - - debug("threaded_n_mode_product_array: memory_stride=%d\n", memory_stride); - debug("threaded_n_mode_product_array: thread_count=%d\n", thread_count); - data.done = 0; data.matrix = matrix; data.vector = vector; @@ -158,9 +185,9 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t case data::partition::slice: consumer = (thread_function_t) &slice_consumer; break; - case data::partition::fiber_decomposition: - consumer = (thread_function_t) &subfiber_consumer; - producer = (thread_function_t) &subfiber_producer; + case data::partition::block: + consumer = (thread_function_t) &block_consumer; + break; default: die("threaded_n_mode_product_array: tensor product for '%s' partition is not currently supported.\n", data_partition_to_string(data_partition)); From 7cd401edc1b177508a3f8d2f6ea46ce0ef633d35 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Tue, 15 Nov 2011 12:05:47 -0700 Subject: [PATCH 56/57] + Added fiber and block data partitioning --- src/data.cc | 6 ++-- src/data.h | 3 +- src/operation_n_mode_product.cc | 52 +++++++++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/src/data.cc b/src/data.cc index 492fa26..981fc56 100644 --- a/src/data.cc +++ b/src/data.cc @@ -7,15 +7,17 @@ static char const *map_data_partition_to_string[] = { "unknown", "fiber", + "fiber-block", "slice", - "block" + "slice-block" }; static char const *map_data_partition_to_description[] = { "unknown", "fiber per thread", + "block of fibers per thread" "slice per thread", - "block per thread" + "block of slices per thread" }; char const* diff --git a/src/data.h b/src/data.h index db7d1a5..5c15991 100644 --- a/src/data.h +++ b/src/data.h @@ -7,8 +7,9 @@ namespace data { typedef enum { unknown, fiber, + fiber_block, slice, - block + slice_block } type_t; } } diff --git a/src/operation_n_mode_product.cc b/src/operation_n_mode_product.cc index b3b5693..cd83589 100644 --- 
a/src/operation_n_mode_product.cc +++ b/src/operation_n_mode_product.cc @@ -82,7 +82,7 @@ fiber_consumer(thread_argument_t *argument) } void -block_consumer_implementation(product_thread_data_t *data, uint n, uint start, uint end, double **M, double *P, double *T) +fiber_block_consumer_implementation(product_thread_data_t *data, uint n, uint start, uint end, double **M, double *P, double *T) { uint i, j, t, offset; @@ -95,7 +95,7 @@ block_consumer_implementation(product_thread_data_t *data, uint n, uint start, u } thread_address_t -block_consumer(thread_argument_t *argument) +fiber_block_consumer(thread_argument_t *argument) { int id; uint n, block_size; @@ -104,12 +104,12 @@ block_consumer(thread_argument_t *argument) data = (product_thread_data_t*) thread_data(argument); n = data->tensor->n; - block_size = (n*n)/thread_count; id = thread_myid(argument); + block_size = (n*n)/thread_count; start = block_size*id; end = start+block_size; - block_consumer_implementation(data, n, start, end, data->matrix->data, data->vector->data, data->tensor->values); + fiber_block_consumer_implementation(data, n, start, end, data->matrix->data, data->vector->data, data->tensor->values); return NULL; } @@ -153,6 +153,43 @@ slice_consumer(thread_argument_t *argument) return NULL; } +void +slice_block_consumer_implementation(int id, product_thread_data_t *data, uint n, uint start, uint end, double **M, double *P, double *T) +{ + uint i, j, ioffset, joffset; + + for (i = start; i < end; ++i) { + ioffset = i*n*n; + joffset = ioffset; + for (j = 0; j < n; ++j) { + M[i][j] = array_inner_product(n, P, 1, T+joffset, 1); + joffset += n; + } + } +} + +thread_address_t +slice_block_consumer(thread_argument_t *argument) +{ + int id; + uint n, block_size; + uint start, end; + product_thread_data_t *data; + + data = (product_thread_data_t*) thread_data(argument); + n = data->tensor->n; + id = thread_myid(argument); + block_size = n/thread_count; + start = block_size*id; + end = start+block_size; + 
+ DEBUG("thread:%d: block_size=%d/%d=%d, start=%d, end=%d\n", id, n, thread_count, block_size, start, end); + + slice_block_consumer_implementation(id, data, n, start, end, data->matrix->data, data->vector->data, data->tensor->values); + + return NULL; +} + void threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t const *tensor, thread_function_t producer, thread_function_t consumer) { @@ -182,11 +219,14 @@ threaded_n_mode_product_array(matrix_t *matrix, vector_t const *vector, tensor_t case data::partition::fiber: consumer = (thread_function_t) &fiber_consumer; break; + case data::partition::fiber_block: + consumer = (thread_function_t) &fiber_block_consumer; + break; case data::partition::slice: consumer = (thread_function_t) &slice_consumer; break; - case data::partition::block: - consumer = (thread_function_t) &block_consumer; + case data::partition::slice_block: + consumer = (thread_function_t) &slice_block_consumer; break; default: die("threaded_n_mode_product_array: tensor product for '%s' partition is not currently supported.\n", From 3fd5b17aabbf0829f30508a43be5e94493a5d330 Mon Sep 17 00:00:00 2001 From: Ben Burnett Date: Thu, 17 Nov 2011 14:45:53 -0700 Subject: [PATCH 57/57] + Fixed gcc warning about the use of an unitialized variable --- src/matrix_write.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/matrix_write.cc b/src/matrix_write.cc index f1b2fa1..4c8f77b 100644 --- a/src/matrix_write.cc +++ b/src/matrix_write.cc @@ -123,7 +123,7 @@ matrix_write(char const *filename, matrix_t const *matrix, format::type_t format { FILE *file; - debug("matrix_write(0x%x)\n", file); + debug("matrix_write('%s')\n", filename); file = fopen_or_die(filename, "w+"); matrix_fwrite(file, matrix, format);