diff --git a/examples/Makefile b/examples/Makefile
index 5d32bb00973..3ce2a6b07a8 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -2,7 +2,7 @@
 # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 #                         University Research and Technology
 #                         Corporation. All rights reserved.
-# Copyright (c) 2004-2018 The University of Tennessee and The University
+# Copyright (c) 2004-2020 The University of Tennessee and The University
 #                         of Tennessee Research Foundation. All rights
 #                         reserved.
 # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -65,14 +65,15 @@ EXAMPLES = \
 	oshmem_max_reduction \
 	oshmem_strided_puts \
 	oshmem_symmetric_data \
-	spc_example
+	spc_mpit_example \
+	spc_mmap_example

 # Default target. Always build the C MPI examples. Only build the
 # others if we have the appropriate Open MPI / OpenSHMEM language
 # bindings.

-all: hello_c ring_c connectivity_c spc_example
+all: hello_c ring_c connectivity_c spc_mpit_example spc_mmap_example
 	@ if which ompi_info >/dev/null 2>&1 ; then \
 	    $(MAKE) mpi; \
 	fi

@@ -127,7 +128,9 @@ ring_c: ring_c.c
 	$(MPICC) $(CFLAGS) $(LDFLAGS) $? $(LDLIBS) -o $@
 connectivity_c: connectivity_c.c
 	$(MPICC) $(CFLAGS) $(LDFLAGS) $? $(LDLIBS) -o $@
-spc_example: spc_example.c
+spc_mpit_example: spc_mpit_example.c
+	$(MPICC) $(CFLAGS) $(LDFLAGS) $? $(LDLIBS) -o $@
+spc_mmap_example: spc_mmap_example.c
 	$(MPICC) $(CFLAGS) $(LDFLAGS) $? $(LDLIBS) -o $@

 hello_mpifh: hello_mpifh.f
diff --git a/examples/spc_example.c b/examples/spc_example.c
deleted file mode 100644
index 11732cd9f40..00000000000
--- a/examples/spc_example.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2018-2020 The University of Tennessee and The University
- *                         of Tennessee Research Foundation. All rights
- *                         reserved.
- *
- * Simple example usage of SPCs through MPI_T.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "mpi.h"
-
-/* Sends 'num_messages' messages of 'message_size' bytes from rank 0 to rank 1.
- * All messages are send synchronously and with the same tag in MPI_COMM_WORLD.
- */ -void message_exchange(int num_messages, int message_size) -{ - int i, rank; - /* Use calloc to initialize data to 0's */ - char *data = (char*)calloc(message_size, sizeof(char)); - MPI_Status status; - - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if(rank == 0) { - for(i = 0; i < num_messages; i++) - MPI_Send(data, message_size, MPI_BYTE, 1, 123, MPI_COMM_WORLD); - } else if(rank == 1) { - for(i = 0; i < num_messages; i++) - MPI_Recv(data, message_size, MPI_BYTE, 0, 123, MPI_COMM_WORLD, &status); - } - - free(data); -} - -int main(int argc, char **argv) -{ - int num_messages, message_size, rc; - - if(argc < 3) { - printf("Usage: mpirun -np 2 --mca mpi_spc_attach all --mca mpi_spc_dump_enabled true ./spc_example [num_messages] [message_size]\n"); - return -1; - } else { - num_messages = atoi(argv[1]); - message_size = atoi(argv[2]); - } - - int i, rank, size, provided, num, name_len, desc_len, verbosity, bind, var_class, readonly, continuous, atomic, count, index; - MPI_Datatype datatype; - MPI_T_enum enumtype; - MPI_Comm comm; - char name[256], description[256]; - - /* Counter names to be read by ranks 0 and 1 */ - char *counter_names[] = {"runtime_spc_OMPI_BYTES_SENT_USER", - "runtime_spc_OMPI_BYTES_RECEIVED_USER" }; - - MPI_Init(NULL, NULL); - MPI_T_init_thread(MPI_THREAD_SINGLE, &provided); - - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &size); - if(size != 2) { - fprintf(stderr, "ERROR: This test should be run with two MPI processes.\n"); - MPI_Abort(MPI_COMM_WORLD, -1); - } - - /* Determine the MPI_T pvar indices for the OMPI_BYTES_SENT/RECIEVED_USER SPCs */ - index = -1; - MPI_T_pvar_get_num(&num); - for(i = 0; i < num; i++) { - name_len = desc_len = 256; - rc = PMPI_T_pvar_get_info(i, name, &name_len, &verbosity, - &var_class, &datatype, &enumtype, description, &desc_len, &bind, - &readonly, &continuous, &atomic); - if( MPI_SUCCESS != rc ) - continue; - if(strcmp(name, counter_names[rank]) == 0) { - index = i; - printf("[%d] %s -> %s\n", rank, name, description); - } - } - - /* Make sure we found the counters */ - if(index == -1) { - fprintf(stderr, "ERROR: Couldn't find the appropriate SPC counter in the MPI_T pvars.\n"); - MPI_Abort(MPI_COMM_WORLD, -1); - } - - int ret; - long long value; - - MPI_T_pvar_session session; - MPI_T_pvar_handle handle; - /* Create the MPI_T sessions/handles for the counters and start the counters */ - ret = MPI_T_pvar_session_create(&session); - ret = MPI_T_pvar_handle_alloc(session, index, NULL, &handle, &count); - ret = MPI_T_pvar_start(session, handle); - - message_exchange(num_messages, message_size); - - ret = MPI_T_pvar_read(session, handle, &value); - /* Print the counter values in order by rank */ - for(i = 0; i < 2; i++) { - if(i == rank) { - printf("[%d] Value Read: %lld\n", rank, value); - fflush(stdout); - } - MPI_Barrier(MPI_COMM_WORLD); - } - /* Stop the MPI_T session, free the handle, and then free the session */ - ret = MPI_T_pvar_stop(session, handle); - ret = MPI_T_pvar_handle_free(session, &handle); - ret = MPI_T_pvar_session_free(&session); - - MPI_T_finalize(); - MPI_Finalize(); - - return 0; -} diff --git a/examples/spc_mmap_example.c b/examples/spc_mmap_example.c new file mode 100644 index 00000000000..95b94a5b50b --- /dev/null +++ b/examples/spc_mmap_example.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Simple example usage of SPCs through an mmap'd file. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* This structure will help us store all of the offsets for each + * counter that we want to print out. + */ +typedef struct spc_s { + char name[128]; + int offset; + int rules_offset; + int bins_offset; +} spc_t; + +int main(int argc, char **argv) +{ + if(argc < 4) { + printf("Usage: ./spc_mmap_test [num_messages] [message_size] [XML string]\n"); + return -1; + } + + MPI_Init(NULL, NULL); + + int i, num_messages = atoi(argv[1]), message_size = atoi(argv[2]), rank, shm_fd; + char *buf = (char*)malloc(message_size * sizeof(char)); + + MPI_Request *requests = (MPI_Request*)malloc(num_messages * sizeof(MPI_Request)); + MPI_Status *statuses = (MPI_Status*)malloc(num_messages * sizeof(MPI_Status)); + MPI_Status status; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int retval, shm_file_size, num_counters, freq_mhz; + long long value; + char filename[128], shm_filename[128], line[128], *token; + + char hostname[128]; + gethostname(hostname, 128); + + char *nodename; + nodename = strtok(hostname, "."); + + char *xml_string = argv[3]; + snprintf(filename, 128, "/dev/shm/spc_data.%s.%s.%d.xml", nodename, xml_string, rank); + + FILE *fptr = NULL; + void *data_ptr; + spc_t *spc_data; + + if(NULL == (fptr = fopen(filename, "r"))) { + printf("Couldn't open xml file.\n"); + MPI_Finalize(); + return -1; + } else { + printf("[%d] Successfully opened the XML file!\n", rank); + } + + /* The following is to read the formatted XML file to get the basic + * information we need to read the shared memory file and properly + * format some counters. + */ + char tmp_filename[128]; + fgets(line, 128, fptr); + fgets(line, 128, fptr); + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%s", shm_filename); + + if(rank == 0) { + printf("shm_filename: %s\n", shm_filename); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &shm_file_size); + if(rank == 0) { + printf("shm_file_size: %d\n", shm_file_size); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &num_counters); + if(rank == 0) { + printf("num_counters: %d\n", num_counters); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &freq_mhz); + if(rank == 0) { + printf("freq_mhz: %d\n", freq_mhz); + } + + if(-1 == (shm_fd = open(shm_filename, O_RDONLY))){ + printf("\nCould not open file '%s'... Error String: %s\n", shm_filename, strerror(errno)); + return -1; + } else { + if(MAP_FAILED == (data_ptr = mmap(0, 8192, PROT_READ, MAP_SHARED, shm_fd, 0))) { + printf("Map failed :(\n"); + return -1; + } + printf("Successfully mmap'd file!\n"); + } + + spc_data = (spc_t*)malloc(num_counters * sizeof(spc_t)); + + for(i = 0; i < num_counters; i++) { + fgets(line, 128, fptr); /* Counter begin header */ + /* This should never happen... */ + if(strcmp(line,"\n") == 0) { + printf("Parsing ended prematurely. 
There weren't enough counters.\n"); + break; + } + + fgets(line, 128, fptr); /* Counter name header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%s", spc_data[i].name); /* Counter name */ + + fgets(line, 128, fptr); /* Counter value offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].offset); /* Counter offset */ + + fgets(line, 128, fptr); /* Counter rules offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].rules_offset); /* Counter rules offset */ + + fgets(line, 128, fptr); /* Counter bins offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].bins_offset); /* Counter bins offset */ + + fgets(line, 128, fptr); /* Counter end header */ + } + + fclose(fptr); + + /* The following communication pattern is intended to cause a certain + * number of unexpected messages. + */ + if(rank==0) { + for(i=num_messages; i > 0; i--) { + MPI_Isend(buf, message_size, MPI_BYTE, 1, i, MPI_COMM_WORLD, &requests[i-1]); + } + MPI_Send(buf, message_size, MPI_BYTE, 1, 0, MPI_COMM_WORLD); + MPI_Waitall(num_messages, requests, statuses); + + MPI_Barrier(MPI_COMM_WORLD); + + for(i = 0; i < num_counters; i++) { + if((0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_TIME")) || (0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_QUEUE_TIME"))) { + value = (*((long long*)(data_ptr+spc_data[i].offset))) / freq_mhz; + } else { + value = *((long long*)(data_ptr+spc_data[i].offset)); + } + if(value > 0) + printf("[%d] %s\t%lld\n", rank, spc_data[i].name, value ); + } + MPI_Barrier(MPI_COMM_WORLD); + } else { + MPI_Recv(buf, message_size, MPI_BYTE, 0, 0, MPI_COMM_WORLD, &status); + for(i=0; i < num_messages; i++) { + MPI_Recv(buf, message_size, MPI_BYTE, 0, i+1, MPI_COMM_WORLD, &statuses[i]); + } + + MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + for(i = 0; i < num_counters; i++) { + /* These counters are stored in cycles, so we convert them to microseconds. + */ + if((0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_TIME")) || (0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_QUEUE_TIME"))) { + value = (*((long long*)(data_ptr+spc_data[i].offset))) / freq_mhz; + } else { + value = *((long long*)(data_ptr+spc_data[i].offset)); + } + if(value > 0) { + printf("[%d] %s\t%lld\n", rank, spc_data[i].name, value ); + if(spc_data[i].rules_offset > 0) { + int j, *rules = (int*)(data_ptr+spc_data[i].rules_offset); + long long *bins = (long long*)(data_ptr+spc_data[i].bins_offset); + + for(j = 0; j < rules[0]; j++) { + if(j == rules[0]-1) { + printf("\t> %d\t", rules[j]); + } + else { + printf("\t<= %d\t", rules[j+1]); + } + printf("%lld\n", bins[j]); + } + } + } + } + } + + MPI_Finalize(); + + return 0; +} diff --git a/examples/spc_mpit_example.c b/examples/spc_mpit_example.c new file mode 100644 index 00000000000..eb1d7d589f4 --- /dev/null +++ b/examples/spc_mpit_example.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Simple example usage of SPCs through MPI_T. + */ + +#include +#include +#include +#include + +#include "mpi.h" + +/* Sends 'num_messages' messages of 'message_size' bytes from rank 0 to rank 1. + * All messages are send synchronously and with the same tag in MPI_COMM_WORLD. 
+ */ +void message_exchange(int num_messages, int message_size) +{ + int i, rank; + /* Use calloc to initialize data to 0's */ + char *data = (char*)calloc(message_size, sizeof(char)); + MPI_Status status; + MPI_Request req; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + /* This is designed to have at least num_messages unexpected messages in order to + * hit the unexpected message queue counters. The broadcasts are here to showcase + * the collective bin counters and the P2P and Eager message counters. + */ + if(rank == 0) { + for(i = 0; i < num_messages; i++) { + MPI_Isend(data, message_size, MPI_BYTE, 1, 123, MPI_COMM_WORLD, &req); + } + MPI_Send(data, message_size, MPI_BYTE, 1, 321, MPI_COMM_WORLD); + for(i = 0; i < num_messages; i++) { + MPI_Bcast(data, message_size, MPI_BYTE, 0, MPI_COMM_WORLD); + } + } else if(rank == 1) { + MPI_Recv(data, message_size, MPI_BYTE, 0, 321, MPI_COMM_WORLD, &status); + for(i = 0; i < num_messages; i++) { + MPI_Recv(data, message_size, MPI_BYTE, 0, 123, MPI_COMM_WORLD, &status); + } + for(i = 0; i < num_messages; i++) { + MPI_Bcast(data, message_size, MPI_BYTE, 0, MPI_COMM_WORLD); + } + } + /* This should use the binomial algorithm so it has at least one counter value */ + MPI_Bcast(data, 1, MPI_BYTE, 0, MPI_COMM_WORLD); + + free(data); +} + +int main(int argc, char **argv) +{ + int num_messages, message_size, rc; + + if(argc < 3) { + printf("Usage: mpirun -np 2 --mca mpi_spc_attach all --mca mpi_spc_dump_enabled true ./spc_example [num_messages] [message_size]\n"); + return -1; + } else { + num_messages = atoi(argv[1]); + message_size = atoi(argv[2]); + if(message_size <= 0) { + printf("Message size must be positive.\n"); + return -1; + } + } + + int i, j, rank, size, provided, num, name_len, desc_len, verbosity, bind, var_class, readonly, continuous, atomic, count, index, xml_index; + MPI_Datatype datatype; + MPI_T_enum enumtype; + MPI_Comm comm; + char name[256], description[256]; + + /* Counter names to be read by ranks 0 and 1 */ + char *counter_names[] = {"runtime_spc_OMPI_SPC_BASE_BCAST_BINOMIAL", + "runtime_spc_OMPI_SPC_BYTES_RECEIVED_USER" }; + char *xml_counter = "runtime_spc_OMPI_SPC_XML_FILE"; + + MPI_Init(NULL, NULL); + MPI_T_init_thread(MPI_THREAD_SINGLE, &provided); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + if(size != 2) { + fprintf(stderr, "ERROR: This test should be run with two MPI processes.\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + + if(rank == 0) { + printf("##################################################################\n"); + printf("This test is designed to highlight several different SPC counters.\n"); + printf("The MPI workload of this test will use 1 MPI_Send and %d MPI_Isend\n", num_messages); + printf("operation(s) on the sender side (rank 0) and %d MPI_Recv operation(s)\n", num_messages+1); + printf("on the receiver side (rank 1) in such a way that at least %d message(s)\n", num_messages); + printf("are unexpected. This highlights the unexpected message queue SPCs.\n"); + printf("There will also be %d MPI_Bcast operation(s) with one of them being of\n", num_messages+1); + printf("size 1 byte, and %d being of size %d byte(s). The 1 byte MPI_Bcast is\n", num_messages, message_size); + printf("meant to ensure that there is at least one MPI_Bcast that uses the\n"); + printf("binomial algorithm so the MPI_T pvar isn't all 0's. 
The addition of\n"); + printf("the broadcasts also has the effect of showcasing the P2P message size,\n"); + printf("eager vs not eager message, and bytes sent by the user vs MPI SPCs.\n"); + printf("Be sure to set the mpi_spc_dump_enabled MCA parameter to true in order\n"); + printf("to see all of the tracked SPCs.\n"); + printf("##################################################################\n\n"); + } + MPI_Barrier(MPI_COMM_WORLD); + + /* Determine the MPI_T pvar indices for the requested SPCs */ + index = xml_index = -1; + MPI_T_pvar_get_num(&num); + for(i = 0; i < num; i++) { + name_len = desc_len = 256; + rc = PMPI_T_pvar_get_info(i, name, &name_len, &verbosity, + &var_class, &datatype, &enumtype, description, &desc_len, &bind, + &readonly, &continuous, &atomic); + if( MPI_SUCCESS != rc ) + continue; + + if(strcmp(name, xml_counter) == 0) { + xml_index = i; + printf("[%d] %s -> %s\n", rank, name, description); + } + if(strcmp(name, counter_names[rank]) == 0) { + index = i; + printf("[%d] %s -> %s\n", rank, name, description); + } + } + + /* Make sure we found the counters */ + if(index == -1 || xml_index == -1) { + fprintf(stderr, "ERROR: Couldn't find the appropriate SPC counter in the MPI_T pvars.\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + + int ret, xml_count; + long long *values = NULL; + char *xml_filename = (char*)malloc(128 * sizeof(char)); + + MPI_T_pvar_session session; + MPI_T_pvar_handle handle; + /* Create the MPI_T sessions/handles for the counters and start the counters */ + ret = MPI_T_pvar_session_create(&session); + ret = MPI_T_pvar_handle_alloc(session, index, NULL, &handle, &count); + ret = MPI_T_pvar_start(session, handle); + + values = (long long*)malloc(count * sizeof(long long)); + + MPI_T_pvar_session xml_session; + MPI_T_pvar_handle xml_handle; + if(xml_index >= 0) { + ret = MPI_T_pvar_session_create(&xml_session); + ret = MPI_T_pvar_handle_alloc(xml_session, xml_index, NULL, &xml_handle, &xml_count); + ret = MPI_T_pvar_start(xml_session, xml_handle); + } + + double timer = MPI_Wtime(); + message_exchange(num_messages, message_size); + timer = MPI_Wtime() - timer; + + printf("[%d] Elapsed time: %lf seconds\n", rank, timer); + + ret = MPI_T_pvar_read(session, handle, values); + if(xml_index >= 0) { + ret = MPI_T_pvar_read(xml_session, xml_handle, &xml_filename); + } + + /* Print the counter values in order by rank */ + for(i = 0; i < 2; i++) { + printf("\n"); + if(i == rank) { + if(xml_index >= 0) { + printf("[%d] XML Counter Value Read: %s\n", rank, xml_filename); + } + for(j = 0; j < count; j++) { + printf("[%d] %s Counter Value Read: %lld\n", rank, counter_names[rank], values[j]); + } + fflush(stdout); + } + MPI_Barrier(MPI_COMM_WORLD); + } + /* Stop the MPI_T session, free the handle, and then free the session */ + ret = MPI_T_pvar_stop(session, handle); + ret = MPI_T_pvar_handle_free(session, &handle); + ret = MPI_T_pvar_session_free(&session); + + MPI_T_finalize(); + MPI_Finalize(); + + return 0; +} diff --git a/examples/spc_snapshot_parse.py b/examples/spc_snapshot_parse.py new file mode 100755 index 00000000000..20e4f98bea5 --- /dev/null +++ b/examples/spc_snapshot_parse.py @@ -0,0 +1,177 @@ +#!/usr/bin/python + +# Copyright (c) 2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# +# Simple example usage of SPC snapshots that creates a heatmap of +# SPC usage over time. 
+ +import sys +import glob +import operator +import struct + +import numpy as np +import matplotlib +matplotlib.use('Agg') # For use with headless systems +import matplotlib.pyplot as plt +import matplotlib.cm as cm +import matplotlib.ticker as ticker + +def combine(filename, data): + f = open(filename, 'rb') + for i in range(0,num_counters): + temp = struct.unpack('l', f.read(8))[0] + if 'TIME' in names[i]: + temp /= freq_mhz + data[i].append(temp) + +def fmt(x, pos): + return '{:,.0f}'.format(x) + +# Make sure the proper number of arguments have been supplied +if len(sys.argv) < 4: + print("Usage: ./spc_snapshot_parse.py [/path/to/data/files] [datafile_label] [list_of_spcs]") + exit() + +path = sys.argv[1] +label = sys.argv[2] + +xml_filename = '' +# Lists for storing the snapshot data files from each rank +copies = [] +ends = [] +# Populate the lists with the appropriate data files +for filename in glob.glob(path + "/spc_data*"): + if label in filename: + if xml_filename == '' and '.xml' in filename: + xml_filename = filename + if '.xml' not in filename: + temp = filename.split('/')[-1].split('.') + if len(temp) < 5: + temp[-1] = int(temp[-1]) + ends.append(temp) + else: + temp[-1] = int(temp[-1]) + temp[-2] = int(temp[-2]) + copies.append(temp) + +# Sort the lists +ends = sorted(ends, key = operator.itemgetter(-1)) +for i in range(0,len(ends)): + ends[i][-1] = str(ends[i][-1]) +copies = sorted(copies, key = operator.itemgetter(-2,-1)) +for i in range(0,len(copies)): + copies[i][-1] = str(copies[i][-1]) + copies[i][-2] = str(copies[i][-2]) + +sep = '.' + +xml_file = open(xml_filename, 'r') +num_counters = 0 +freq_mhz = 0 +names = [] +base = [] +# Parse the XML file (same for all data files) +for line in xml_file: + if 'num_counters' in line: + num_counters = int(line.split('>')[1].split('<')[0]) + if 'freq_mhz' in line: + freq_mhz = int(line.split('>')[1].split('<')[0]) + if '' in line: + names.append(line.split('>')[1].split('<')[0]) + value = [names[-1]] + base.append(value) + +prev = copies[0] +i = 0 +ranks = [] +values = [] +times = [] +time = [] + +# Populate the data lists +for n in range(0,len(base)): + values.append([0, names[n]]) +for c in copies: + if c[-2] != prev[-2]: + filename = path + "/" + sep.join(ends[i]) + combine(filename, values) + + ranks.append(values) + times.append(time) + for j in range(0, len(names)): + temp = [ranks[0][j][0]] + + values = [] + time = [] + for n in range(0,len(base)): + values.append([i+1, names[n]]) + i += 1 + + filename = path + "/" + sep.join(c) + time.append(int(filename.split('.')[-1])) + combine(filename, values) + prev = c + +filename = path + "/" + sep.join(ends[i]) +combine(filename, values) +ranks.append(values) +times.append(time) + +spc_list = sys.argv[3].split(",") + +for i in range(0, len(names)): + fig = plt.figure(num=None, figsize=(7, 9), dpi=200, facecolor='w', edgecolor='k') + + plot = False + # Only plot the SPCs of interest + if names[i] in spc_list: + plot = True + + map_data = [] + avg_x = [] + + for j in range(0, len(ranks)): + if avg_x == None: + avg_x = np.zeros(len(times[j])-1) + empty = True + for k in range(2,len(ranks[j][i])): + if ranks[j][i][k] != 0: + empty = False + break + if not empty: + if plot: + xvals = [] + yvals = [] + for l in range(1, len(times[j])): + if ranks[j][i][l+2] - ranks[j][i][l+1] < 0: + break + xvals.append(times[j][l] - times[j][0]) + yvals.append(ranks[j][i][l+2] - ranks[j][i][l+1]) + + map_data.append(yvals) + for v in range(0,len(avg_x)): + avg_x[v] += xvals[v] + if plot: + for v in 
range(0,len(avg_x)): + avg_x[v] /= float(len(ranks)) + + ax = plt.gca() + im = ax.imshow(map_data, cmap='Reds', interpolation='nearest') + + cbar = ax.figure.colorbar(im, ax=ax, format=ticker.FuncFormatter(fmt)) + cbar.ax.set_ylabel("Counter Value", rotation=-90, va="bottom") + + plt.title(names[i] + ' Snapshot Difference') + + plt.xlabel('Time') + plt.ylabel('MPI Rank') + + ax.set_xticks(np.arange(len(avg_x))) + ax.set_yticks(np.arange(len(map_data))) + ax.set_xticklabels(avg_x) + + plt.show() + fig.savefig(names[i] + '.png') diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c index f3d3fd1d0a7..aca3292d68f 100644 --- a/ompi/mca/coll/base/coll_base_allgather.c +++ b/ompi/mca/coll/base/coll_base_allgather.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -31,6 +31,7 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -99,6 +100,8 @@ int ompi_coll_base_allgather_intra_bruck(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allgather_intra_bruck rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_BRUCK, scount * sdtype->super.size, size); + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -286,6 +289,8 @@ ompi_coll_base_allgather_intra_recursivedoubling(const void *sbuf, int scount, "coll:base:allgather_intra_recursivedoubling rank %d, size %d", rank, size)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING, scount * sdtype->super.size, size); + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -372,6 +377,8 @@ int ompi_coll_base_allgather_intra_ring(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allgather_intra_ring rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_RING, scount * sdtype->super.size, size); + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -508,6 +515,8 @@ ompi_coll_base_allgather_intra_neighborexchange(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allgather_intra_neighborexchange rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE, scount * sdtype->super.size, size); + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -611,6 +620,8 @@ int ompi_coll_base_allgather_intra_two_procs(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_allgather_intra_two_procs rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_TWO_PROCS, scount * sdtype->super.size, 2); + if (2 != ompi_comm_size(comm)) { return MPI_ERR_UNSUPPORTED_OPERATION; } @@ -689,6 +700,8 @@ ompi_coll_base_allgather_intra_basic_linear(const void *sbuf, int scount, int err; 
ptrdiff_t lb, extent; + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLGATHER_LINEAR, scount * sdtype->super.size, ompi_comm_size(comm)); + /* Handle MPI_IN_PLACE (see explanantion in reduce.c for how to allocate temp buffer) -- note that rank 0 can use IN_PLACE natively, and we can just alias the right position in rbuf diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 828b32061a9..e34d74f750b 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -36,6 +36,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -63,6 +64,8 @@ ompi_coll_base_allreduce_intra_nonoverlapping(const void *sbuf, void *rbuf, int OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_nonoverlapping rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING, count * dtype->super.size, ompi_comm_size(comm)); + /* Reduce to 0 and broadcast. */ if (MPI_IN_PLACE == sbuf) { @@ -145,6 +148,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_intra_recursivedoubling rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING, count * dtype->super.size, size); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { @@ -358,6 +363,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_intra_ring rank %d, count %d", rank, count)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLREDUCE_RING, count * dtype->super.size, size); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { @@ -637,6 +644,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_intra_ring_segmented rank %d, count %d", rank, count)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED, count * dtype->super.size, size); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { @@ -890,6 +899,8 @@ ompi_coll_base_allreduce_intra_basic_linear(const void *sbuf, void *rbuf, int co OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_basic_linear rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLREDUCE_LINEAR, count * dtype->super.size, ompi_comm_size(comm)); + /* Reduce to 0 and broadcast. */ if (MPI_IN_PLACE == sbuf) { diff --git a/ompi/mca/coll/base/coll_base_alltoall.c b/ompi/mca/coll/base/coll_base_alltoall.c index a61bf40ca97..74ea98cff05 100644 --- a/ompi/mca/coll/base/coll_base_alltoall.c +++ b/ompi/mca/coll/base/coll_base_alltoall.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
- * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -32,6 +32,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -53,6 +54,8 @@ mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount, size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_INPLACE, rcount * rdtype->super.size, size); + /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; @@ -151,6 +154,8 @@ int ompi_coll_base_alltoall_intra_pairwise(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:alltoall_intra_pairwise rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_PAIRWISE, rcount * rdtype->super.size, size); + err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_extent (rdtype, &lb, &rext); @@ -212,6 +217,8 @@ int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:alltoall_intra_bruck rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_BRUCK, rcount * rdtype->super.size, size); + err = ompi_datatype_type_extent (sdtype, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } @@ -358,6 +365,8 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_alltoall_intra_linear_sync rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC, rcount * rdtype->super.size, size); + error = ompi_datatype_get_extent(sdtype, &slb, &sext); if (OMPI_SUCCESS != error) { return error; @@ -512,6 +521,8 @@ int ompi_coll_base_alltoall_intra_two_procs(const void *sbuf, int scount, return MPI_ERR_UNSUPPORTED_OPERATION; } + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_TWO_PROCS, rcount * rdtype->super.size, 2); + err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } @@ -594,6 +605,8 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_alltoall_intra_basic_linear rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_ALLTOALL_LINEAR, rcount * rdtype->super.size, size); + err = ompi_datatype_get_extent(sdtype, &lb, &sndinc); if (OMPI_SUCCESS != err) { return err; diff --git a/ompi/mca/coll/base/coll_base_barrier.c b/ompi/mca/coll/base/coll_base_barrier.c index 49ac4ea2e9e..bc192085b4f 100644 --- a/ompi/mca/coll/base/coll_base_barrier.c +++ b/ompi/mca/coll/base/coll_base_barrier.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -33,6 +33,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -109,6 +110,8 @@ int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank)); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_DOUBLE_RING, 1); + left = ((rank-1)%size); right = ((rank+1)%size); @@ -182,6 +185,8 @@ int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *c "ompi_coll_base_barrier_intra_recursivedoubling rank %d", rank)); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_RECURSIVE_DOUBLING, 1); + /* do nearest power of 2 less than size calc */ adjsize = opal_next_poweroftwo(size); adjsize >>= 1; @@ -262,6 +267,8 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_bruck rank %d", rank)); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_BRUCK, 1); + /* exchange data with rank-2^k and rank+2^k */ for (distance = 1; distance < size; distance <<= 1) { from = (rank + size - distance) % size; @@ -304,6 +311,8 @@ int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_two_procs rank %d", remote)); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_TWO_PROCS, 1); + remote = (remote + 1) & 0x1; err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, @@ -338,6 +347,8 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm, return MPI_SUCCESS; rank = ompi_comm_rank(comm); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_LINEAR, 1); + /* All non-root send & receive zero-length message. */ if (rank > 0) { err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0, @@ -414,6 +425,8 @@ int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm, "ompi_coll_base_barrier_intra_tree %d", rank)); + SPC_RECORD(OMPI_SPC_BASE_BARRIER_TREE, 1); + /* Find the nearest power of 2 of the communicator size. */ depth = opal_next_poweroftwo_inclusive(size); diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c index 7af75353d2d..fd7d8f7d564 100644 --- a/ompi/mca/coll/base/coll_base_bcast.c +++ b/ompi/mca/coll/base/coll_base_bcast.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -31,6 +31,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -265,6 +266,8 @@ ompi_coll_base_bcast_intra_bintree ( void* buffer, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_BINTREE, count * datatype->super.size, ompi_comm_size(comm)); + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_bintree ); } @@ -293,6 +296,8 @@ ompi_coll_base_bcast_intra_pipeline( void* buffer, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_PIPELINE, count*typelng, ompi_comm_size(comm)); + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_pipeline ); } @@ -321,6 +326,8 @@ ompi_coll_base_bcast_intra_chain( void* buffer, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), chains, segsize, (unsigned long)typelng, segcount)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_CHAIN, count*typelng, ompi_comm_size(comm)); + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_chain ); } @@ -349,6 +356,8 @@ ompi_coll_base_bcast_intra_binomial( void* buffer, OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d", ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_BINOMIAL, count*typelng, ompi_comm_size(comm)); + return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module, segcount, data->cached_bmtree ); } @@ -388,6 +397,8 @@ ompi_coll_base_bcast_intra_split_bintree ( void* buffer, err = ompi_datatype_type_size( datatype, &type_size ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_SPLIT_BINTREE, count*type_size, ompi_comm_size(comm)); + /* Determine number of segments and number of elements per segment */ counts[0] = count/2; if (count % 2 != 0) counts[0]++; @@ -685,6 +696,11 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count, ompi_coll_base_free_reqs(reqs, i); } + size_t typelng; + ompi_datatype_type_size( datatype, &typelng ); + + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_LINEAR, count*typelng, size); + /* All done */ return err; } diff --git a/ompi/mca/coll/base/coll_base_gather.c b/ompi/mca/coll/base/coll_base_gather.c index 6fd1e981461..90780792a02 100644 --- a/ompi/mca/coll/base/coll_base_gather.c +++ b/ompi/mca/coll/base/coll_base_gather.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -32,6 +32,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -61,6 +62,8 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_GATHER_BINOMIAL, scount * sdtype->super.size, size); + /* create the binomial tree */ COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); bmtree = data->cached_in_order_bmtree; @@ -225,6 +228,8 @@ ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_GATHER_LINEAR_SYNC, scount * sdtype->super.size, size); + if (rank != root) { /* Non-root processes: - receive zero byte message from the root, @@ -384,6 +389,8 @@ ompi_coll_base_gather_intra_basic_linear(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_basic_linear rank %d", rank)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_GATHER_LINEAR, scount * sdtype->super.size, size); + if (rank != root) { return MCA_PML_CALL(send(sbuf, scount, sdtype, root, MCA_COLL_BASE_TAG_GATHER, diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index dfd709bfb90..e7b7590401d 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -36,6 +36,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -399,6 +400,8 @@ int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, int c ompi_datatype_type_size( datatype, &typelng ); COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_CHAIN, count * datatype->super.size, ompi_comm_size(comm)); + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_chain, @@ -431,6 +434,8 @@ int ompi_coll_base_reduce_intra_pipeline( const void *sendbuf, void *recvbuf, ompi_datatype_type_size( datatype, &typelng ); COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_PIPELINE, count * datatype->super.size, ompi_comm_size(comm)); + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_pipeline, @@ -462,6 +467,8 @@ int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf, ompi_datatype_type_size( datatype, &typelng ); COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_BINARY, count * datatype->super.size, ompi_comm_size(comm)); + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_bintree, @@ -493,6 +500,8 @@ int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf, ompi_datatype_type_size( datatype, &typelng ); COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_BINOMIAL, count * datatype->super.size, ompi_comm_size(comm)); + return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, module, data->cached_in_order_bmtree, @@ -530,6 +539,8 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv COLL_BASE_UPDATE_IN_ORDER_BINTREE( comm, base_module ); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE, count * datatype->super.size, size); + /** * Determine number of segments and number of elements * sent per operation @@ -643,6 +654,8 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_LINEAR, count * dtype->super.size, size); + /* If not root, send data to the root. */ if (rank != root) { diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c index 984a91787a0..9a938306ecd 100644 --- a/ompi/mca/coll/base/coll_base_reduce_scatter.c +++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -35,6 +35,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/op/op.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -62,6 +63,8 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r for (i = 0, total_count = 0; i < size; i++) { total_count += rcounts[i]; } + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING, total_count * dtype->super.size, size); + /* Reduce to rank 0 (root) and scatterv */ tmprbuf = (char*) rbuf; if (MPI_IN_PLACE == sbuf) { @@ -159,6 +162,8 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, } count = disps[size - 1] + rcounts[size - 1]; + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING, count * dtype->super.size, size); + /* short cut the trivial case */ if (0 == count) { free(disps); @@ -487,6 +492,8 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, const in if (max_block_count < rcounts[i]) max_block_count = rcounts[i]; } + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_REDUCE_SCATTER_RING, total_count * dtype->super.size, size); + /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { diff --git a/ompi/mca/coll/base/coll_base_scatter.c b/ompi/mca/coll/base/coll_base_scatter.c index 0ca35971532..ebe75c8a39e 100644 --- a/ompi/mca/coll/base/coll_base_scatter.c +++ b/ompi/mca/coll/base/coll_base_scatter.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -32,6 +32,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/runtime/ompi_spc.h" #include "coll_base_topo.h" #include "coll_base_util.h" @@ -78,6 +79,7 @@ ompi_coll_base_scatter_intra_binomial( OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:scatter_intra_binomial rank %d/%d", rank, size)); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_SCATTER_BINOMIAL, scount * sdtype->super.size, size); /* Create the binomial tree */ COLL_BASE_UPDATE_IN_ORDER_BMTREE(comm, base_module, root); @@ -232,6 +234,8 @@ ompi_coll_base_scatter_intra_basic_linear(const void *sbuf, int scount, rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_SCATTER_LINEAR, scount * sdtype->super.size, size); + /* If not root, receive data. */ if (rank != root) { diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 0457d6feb0b..59f2d5e3940 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2019 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, @@ -143,6 +143,7 @@ append_frag_to_ordered_list(mca_pml_ob1_recv_frag_t** queue, d1 = d2; prior = (mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_prev); d2 = prior->hdr.hdr_match.hdr_seq - hdr->hdr_seq; + SPC_RECORD(OMPI_SPC_OOS_QUEUE_HOPS, 1); } while( (hdr->hdr_seq < prior->hdr.hdr_match.hdr_seq) && (d1 > d2) && (prior != *queue) ); } else { @@ -150,6 +151,7 @@ append_frag_to_ordered_list(mca_pml_ob1_recv_frag_t** queue, next_seq = ((mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_next))->hdr.hdr_match.hdr_seq; /* prevent rollover */ while( (hdr->hdr_seq > prior_seq) && (hdr->hdr_seq > next_seq) && (prior_seq < next_seq) ) { + SPC_RECORD(OMPI_SPC_OOS_QUEUE_HOPS, 1); prior_seq = next_seq; prior = (mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_next); next_seq = ((mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_next))->hdr.hdr_match.hdr_seq; @@ -334,6 +336,7 @@ check_cantmatch_for_match(mca_pml_ob1_comm_proc_t *proc) mca_pml_ob1_recv_frag_t *frag = proc->frags_cant_match; if( (NULL != frag) && (frag->hdr.hdr_match.hdr_seq == proc->expected_sequence) ) { + SPC_RECORD(OMPI_SPC_OOS_IN_QUEUE, -1); return remove_head_from_ordered_list(&proc->frags_cant_match); } return NULL; @@ -407,7 +410,15 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, MCA_PML_OB1_RECV_FRAG_ALLOC(frag); MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl); append_frag_to_ordered_list(&proc->frags_cant_match, frag, proc->expected_sequence); +#if SPC_ENABLE == 1 + size_t total_data = segments[0].seg_len; + for(int i = 1; i < num_segments; i++) { + total_data += segments[i].seg_len; + } + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_QUEUE_DATA, OMPI_SPC_OOS_QUEUE_DATA, total_data); +#endif SPC_RECORD(OMPI_SPC_OUT_OF_SEQUENCE, 1); + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE, 1); OB1_MATCHING_UNLOCK(&comm->matching_lock); return; } @@ -503,13 +514,20 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, mca_pml_ob1_recv_frag_t* frag; OB1_MATCHING_LOCK(&comm->matching_lock); +#if SPC_ENABLE == 1 + opal_timer_t timer; + timer = 0; +#endif + SPC_TIMER_START(OMPI_SPC_OOS_MATCH_TIME, &timer); if((frag = check_cantmatch_for_match(proc))) { + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_TIME, &timer); /* mca_pml_ob1_recv_frag_match_proc() will release the lock. 
*/ mca_pml_ob1_recv_frag_match_proc(frag->btl, comm_ptr, proc, &frag->hdr.hdr_match, frag->segments, frag->num_segments, frag->hdr.hdr_match.hdr_common.hdr_type, frag); } else { + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_TIME, &timer); OB1_MATCHING_UNLOCK(&comm->matching_lock); } } @@ -802,7 +820,8 @@ match_one(mca_btl_base_module_t *btl, mca_pml_ob1_recv_frag_t* frag) { #if SPC_ENABLE == 1 - opal_timer_t timer = 0; + opal_timer_t timer; + timer = 0; #endif SPC_TIMER_START(OMPI_SPC_MATCH_TIME, &timer); @@ -856,6 +875,12 @@ match_one(mca_btl_base_module_t *btl, SPC_TIMER_STOP(OMPI_SPC_MATCH_TIME, &timer); return match; } + SPC_TIMER_STOP(OMPI_SPC_MATCH_TIME, &timer); + +#if SPC_ENABLE == 1 + opal_timer_t queue_timer = 0; +#endif + SPC_TIMER_START(OMPI_SPC_MATCH_QUEUE_TIME, &queue_timer); /* if no match found, place on unexpected queue */ #if MCA_PML_OB1_CUSTOM_MATCH @@ -865,12 +890,21 @@ match_one(mca_btl_base_module_t *btl, append_frag_to_list(&proc->unexpected_frags, btl, hdr, segments, num_segments, frag); #endif + SPC_TIMER_STOP(OMPI_SPC_MATCH_QUEUE_TIME, &queue_timer); + +#if SPC_ENABLE == 1 + size_t total_data = segments[0].seg_len; + for(int i = 1; i < num_segments; i++) { + total_data += segments[i].seg_len; + } + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_UNEXPECTED_QUEUE_DATA, OMPI_SPC_UNEXPECTED_QUEUE_DATA, total_data); +#endif + SPC_RECORD(OMPI_SPC_UNEXPECTED, 1); - SPC_RECORD(OMPI_SPC_UNEXPECTED_IN_QUEUE, 1); - SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, OMPI_SPC_UNEXPECTED_IN_QUEUE); + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, OMPI_SPC_UNEXPECTED_IN_QUEUE, 1); + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); - SPC_TIMER_STOP(OMPI_SPC_MATCH_TIME, &timer); return NULL; } while(true); } @@ -955,6 +989,12 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, */ OB1_MATCHING_LOCK(&comm->matching_lock); +#if SPC_ENABLE == 1 + opal_timer_t timer; + timer = 0; +#endif + SPC_TIMER_START(OMPI_SPC_OOS_MATCH_QUEUE_TIME, &timer); + frag_msg_seq = hdr->hdr_seq; next_msg_seq_expected = (uint16_t)proc->expected_sequence; @@ -965,15 +1005,23 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, MCA_PML_OB1_RECV_FRAG_ALLOC(frag); MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl); append_frag_to_ordered_list(&proc->frags_cant_match, frag, next_msg_seq_expected); + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_QUEUE_TIME, &timer); +#if SPC_ENABLE == 1 + size_t total_data = segments[0].seg_len; + for(int i = 1; i < num_segments; i++) { + total_data += segments[i].seg_len; + } + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_QUEUE_DATA, OMPI_SPC_OOS_QUEUE_DATA, total_data); +#endif SPC_RECORD(OMPI_SPC_OUT_OF_SEQUENCE, 1); - SPC_RECORD(OMPI_SPC_OOS_IN_QUEUE, 1); - SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE); + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_OOS_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE, 1); OB1_MATCHING_UNLOCK(&comm->matching_lock); return OMPI_SUCCESS; } } + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_QUEUE_TIME, &timer); /* mca_pml_ob1_recv_frag_match_proc() will release the lock. 
*/ return mca_pml_ob1_recv_frag_match_proc(btl, comm_ptr, proc, hdr, @@ -1057,7 +1105,13 @@ mca_pml_ob1_recv_frag_match_proc( mca_btl_base_module_t *btl, */ if(OPAL_UNLIKELY(NULL != proc->frags_cant_match)) { OB1_MATCHING_LOCK(&comm->matching_lock); +#if SPC_ENABLE == 1 + opal_timer_t timer; + timer = 0; +#endif + SPC_TIMER_START(OMPI_SPC_OOS_MATCH_TIME, &timer); if((frag = check_cantmatch_for_match(proc))) { + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_TIME, &timer); hdr = &frag->hdr.hdr_match; segments = frag->segments; num_segments = frag->num_segments; @@ -1065,6 +1119,7 @@ mca_pml_ob1_recv_frag_match_proc( mca_btl_base_module_t *btl, type = hdr->hdr_common.hdr_type; goto match_this_frag; } + SPC_TIMER_STOP(OMPI_SPC_OOS_MATCH_TIME, &timer); OB1_MATCHING_UNLOCK(&comm->matching_lock); } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h index def120ccc62..53b813cbaeb 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2018 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -28,6 +28,7 @@ #define MCA_PML_OB1_RECVFRAG_H #include "pml_ob1_hdr.h" +#include "ompi/runtime/ompi_spc.h" BEGIN_C_DECLS @@ -79,6 +80,8 @@ do { \ macro_segments[0].seg_addr.pval = frag->addr; \ } else { \ buffers[0].len = _size; \ + SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_QUEUE_ALLOCATION, \ + OMPI_SPC_QUEUE_ALLOCATION, _size); \ buffers[0].addr = (char*) \ mca_pml_ob1.allocator->alc_alloc( mca_pml_ob1.allocator, \ buffers[0].len, \ @@ -98,6 +101,7 @@ do { \ do { \ if( frag->segments[0].seg_len > mca_pml_ob1.unexpected_limit ) { \ /* return buffers */ \ + SPC_RECORD(OMPI_SPC_QUEUE_ALLOCATION, -frag->buffers[0].len); \ mca_pml_ob1.allocator->alc_free( mca_pml_ob1.allocator, \ frag->buffers[0].addr ); \ } \ diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 8588b9a5879..0e5288a4135 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2019 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, @@ -1331,6 +1331,13 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) #else opal_list_remove_item(&proc->unexpected_frags, (opal_list_item_t*)frag); +#endif +#if SPC_ENABLE == 1 + size_t total_data = frag->segments[0].seg_len; + for(int i = 1; i < frag->num_segments; i++) { + total_data += frag->segments[i].seg_len; + } + SPC_RECORD(OMPI_SPC_UNEXPECTED_QUEUE_DATA, -total_data); #endif SPC_RECORD(OMPI_SPC_UNEXPECTED_IN_QUEUE, -1); OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); @@ -1367,6 +1374,13 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) #else opal_list_remove_item(&proc->unexpected_frags, (opal_list_item_t*)frag); +#endif +#if SPC_ENABLE == 1 + size_t total_data = frag->segments[0].seg_len; + for(int i = 1; i < frag->num_segments; i++) { + total_data += frag->segments[i].seg_len; + } + SPC_RECORD(OMPI_SPC_UNEXPECTED_QUEUE_DATA, -total_data); #endif SPC_RECORD(OMPI_SPC_UNEXPECTED_IN_QUEUE, -1); OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 88e7f7252c3..30cc48aaadb 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2019 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, @@ -644,9 +644,10 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, /* send */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH); - SPC_USER_OR_MPI(sendreq->req_send.req_base.req_ompi.req_status.MPI_TAG, (ompi_spc_value_t)size, - OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI); + if( OPAL_LIKELY( rc >= OPAL_SUCCESS ) ) { + SPC_USER_OR_MPI(sendreq->req_send.req_base.req_tag, (ompi_spc_value_t)size, + OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI); if( OPAL_LIKELY( 1 == rc ) ) { mca_pml_ob1_match_completion_free_request( bml_btl, sendreq ); } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index 4c52c317003..e5fa56951d1 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -34,6 +34,7 @@ #include "pml_ob1_rdmafrag.h" #include "ompi/mca/bml/bml.h" #include "ompi/memchecker.h" +#include "ompi/runtime/ompi_spc.h" BEGIN_C_DECLS @@ -396,7 +397,9 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq, } #endif /* OPAL_CUDA_GDR_SUPPORT */ + SPC_BIN_RECORD(OMPI_SPC_P2P_MESSAGE_SIZE, size); if( OPAL_LIKELY(size <= eager_limit) ) { + SPC_RECORD(OMPI_SPC_EAGER_MESSAGES, 1); switch(sendreq->req_send.req_send_mode) { case MCA_PML_BASE_SEND_SYNCHRONOUS: rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0); @@ -416,6 +419,7 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq, break; } } else { + SPC_RECORD(OMPI_SPC_NOT_EAGER_MESSAGES, 1); size = eager_limit; if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit)) size = btl->btl_rndv_eager_limit; diff --git a/ompi/runtime/help-mpi-runtime.txt b/ompi/runtime/help-mpi-runtime.txt index 1fcb93a35e9..621fe6995ec 100644 --- a/ompi/runtime/help-mpi-runtime.txt +++ b/ompi/runtime/help-mpi-runtime.txt @@ -3,7 +3,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -120,6 +120,7 @@ There was an error registering software performance counters (SPCs) as MPI_T performance variables. Your job will continue, but SPCs will be disabled for MPI_T. # + [no-pmi] PMIx_Init failed for the following reason: @@ -128,3 +129,26 @@ PMIx_Init failed for the following reason: Open MPI requires access to a local PMIx server to execute. Please ensure that either you are operating in a PMIx-enabled environment, or use "mpirun" to execute the job. + +[spc: default shared memory directory failed] +Failed to access the default shared memory directory, falling back to storing the shared memory +file in '%s'. +# +[spc: filename creation failure] +Failed to create an appropriate filename for use in storing data in a +shared memory file. +# +[spc: shm segment creation failure] +Failed to create a shared memory segment. +# +[spc: shm attach failure] +Failed to attach to created shared memory segment. +# +[spc: shm file open failure] +Failed to open shared memory file: '%s'. +# +[spc: mmap failure] +Failed to mmap the shared memory file. mmap had the following error: +'%s'. You will be unable to read counters through attaching to the +shared memory file. + diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index 07d0ad7b32c..500152508fd 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2018 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved.
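The new help entries above cover the failure paths of the mmap'd-file interface: the runtime stores counter values in a shared-memory data file and writes an XML description giving each counter's name and byte offsets into that file (see ompi_spc_events_init below). A hedged sketch of an external reader, assuming the data-file path and a counter's value offset have already been taken from that XML description, and assuming values are stored as 64-bit integers to match the long long reads used elsewhere in this patch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

/* Read one counter value from the SPC shared-memory data file.
 * 'path' and 'value_offset' are assumed to come from the XML file the
 * runtime writes alongside it; the stored value is assumed to be a
 * 64-bit integer at that byte offset. */
static long long read_spc_value(const char *path, off_t value_offset)
{
    int fd = open(path, O_RDONLY);
    if (fd < 0) { perror("open"); return -1; }

    struct stat st;
    if (fstat(fd, &st) < 0) { perror("fstat"); close(fd); return -1; }

    void *base = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    close(fd);                      /* the mapping remains valid after close */
    if (MAP_FAILED == base) { perror("mmap"); return -1; }

    long long value = *(long long *)((char *)base + value_offset);
    munmap(base, st.st_size);
    return value;
}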
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -75,9 +75,6 @@ bool ompi_async_mpi_finalize = false; uint32_t ompi_add_procs_cutoff = OMPI_ADD_PROCS_CUTOFF_DEFAULT; bool ompi_mpi_dynamics_enabled = true; -char *ompi_mpi_spc_attach_string = NULL; -bool ompi_mpi_spc_dump_enabled = false; - static bool show_default_mca_params = false; static bool show_file_mca_params = false; static bool show_enviro_mca_params = false; @@ -324,21 +321,9 @@ int ompi_mpi_register_params(void) MCA_BASE_VAR_SYN_FLAG_DEPRECATED); } - ompi_mpi_spc_attach_string = NULL; - (void) mca_base_var_register("ompi", "mpi", NULL, "spc_attach", - "A comma delimeted string listing the software-based performance counters (SPCs) to enable.", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_mpi_spc_attach_string); - - ompi_mpi_spc_dump_enabled = false; - (void) mca_base_var_register("ompi", "mpi", NULL, "spc_dump_enabled", - "A boolean value for whether (true) or not (false) to enable dumping SPC counters in MPI_Finalize.", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_mpi_spc_dump_enabled); +#if SPC_ENABLE == 1 + (void) ompi_spc_register_params(); +#endif return OMPI_SUCCESS; } diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c index 654dd158b3e..53edc1decde 100644 --- a/ompi/runtime/ompi_spc.c +++ b/ompi/runtime/ompi_spc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 The University of Tennessee and The University + * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -17,21 +17,38 @@ #include "ompi_spc.h" -opal_timer_t sys_clock_freq_mhz = 0; - -static void ompi_spc_dump(void); - -/* Array for converting from SPC indices to MPI_T indices */ +static opal_timer_t sys_clock_freq_mhz = 0; +static int mpi_t_offset = -1; static bool mpi_t_enabled = false; +static bool spc_enabled = true; +static bool need_free = false; +static bool mmap_failed = false; static ompi_communicator_t *ompi_spc_comm = NULL; +/* MCA Parameter Variables */ +char *ompi_mpi_spc_attach_string = NULL; +char *ompi_mpi_spc_xml_string = NULL; +bool ompi_mpi_spc_dump_enabled = false; +bool ompi_mpi_spc_mmap_enabled = false; +int ompi_mpi_spc_snapshot_period = 0; +int ompi_mpi_spc_p2p_message_boundary = 12288; +int ompi_mpi_spc_collective_message_boundary = 12288; +int ompi_mpi_spc_collective_comm_boundary = 64; + typedef struct ompi_spc_event_t { const char* counter_name; const char* counter_description; } ompi_spc_event_t; +static void ompi_spc_dump(void); + #define SET_COUNTER_ARRAY(NAME, DESC) [NAME] = { .counter_name = #NAME, .counter_description = DESC } +/* STEP 2: Add the counter descriptions for new counters here along with their + * enumeration to be converted to a name value. NOTE: The names and + * descriptions MUST be in the same array location as where you added + * the counter name in the ompi_spc_counters_t enumeration! 
+ */ static ompi_spc_event_t ompi_spc_events_names[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_SEND, "The number of times MPI_Send was called."), SET_COUNTER_ARRAY(OMPI_SPC_BSEND, "The number of times MPI_Bsend was called."), @@ -136,29 +153,99 @@ static ompi_spc_event_t ompi_spc_events_names[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_BYTES_GET, "The number of bytes sent/received using RMA Get operations both through user-level Get functions and internal Get functions."), SET_COUNTER_ARRAY(OMPI_SPC_UNEXPECTED, "The number of messages that arrived as unexpected messages."), SET_COUNTER_ARRAY(OMPI_SPC_OUT_OF_SEQUENCE, "The number of messages that arrived out of the proper sequence."), - SET_COUNTER_ARRAY(OMPI_SPC_MATCH_TIME, "The number of microseconds spent matching unexpected messages. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), + SET_COUNTER_ARRAY(OMPI_SPC_OOS_QUEUE_HOPS, "The number of times we jumped to the next element in the out of sequence message queue's ordered list."), + SET_COUNTER_ARRAY(OMPI_SPC_MATCH_TIME, "The amount of time (MPI_T reports microseconds) spent matching unexpected messages. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), + SET_COUNTER_ARRAY(OMPI_SPC_MATCH_QUEUE_TIME, "The amount of time (MPI_T reports microseconds, stored in cycles) spent inserting unexpected messages into the unexpected message queue. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), + SET_COUNTER_ARRAY(OMPI_SPC_OOS_MATCH_TIME, "The amount of time (MPI_T reports microseconds, stored in cycles) spent matching out-of-sequence messages. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), + SET_COUNTER_ARRAY(OMPI_SPC_OOS_MATCH_QUEUE_TIME, "The amount of time (MPI_T reports microseconds, stored in cycles) spent inserting out-of-sequence messages into the unexpected message queue. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate."), SET_COUNTER_ARRAY(OMPI_SPC_UNEXPECTED_IN_QUEUE, "The number of messages that are currently in the unexpected message queue(s) of an MPI process."), SET_COUNTER_ARRAY(OMPI_SPC_OOS_IN_QUEUE, "The number of messages that are currently in the out of sequence message queue(s) of an MPI process."), - SET_COUNTER_ARRAY(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, "The maximum number of messages that the unexpected message queue(s) within an MPI process " - "contained at once since the last reset of this counter. Note: This counter is reset each time it is read."), - SET_COUNTER_ARRAY(OMPI_SPC_MAX_OOS_IN_QUEUE, "The maximum number of messages that the out of sequence message queue(s) within an MPI process " - "contained at once since the last reset of this counter. 
Note: This counter is reset each time it is read.") + SET_COUNTER_ARRAY(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, "The maximum number of messages that the unexpected message queue(s) within an MPI process contained at once since the last reset of this counter. Note: This counter is reset each time it is read. Note: The OMPI_SPC_UNEXPECTED_IN_QUEUE counter must also be activated."), + SET_COUNTER_ARRAY(OMPI_SPC_MAX_OOS_IN_QUEUE, "The maximum number of messages that the out of sequence message queue(s) within an MPI process contained at once since the last reset of this counter. Note: This counter is reset each time it is read. Note: The OMPI_SPC_OOS_IN_QUEUE counter must also be activated."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_LINEAR, "The number of times the base broadcast used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_CHAIN, "The number of times the base broadcast used the chain algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_PIPELINE, "The number of times the base broadcast used the pipeline algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_SPLIT_BINTREE, "The number of times the base broadcast used the split binary tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_BINTREE, "The number of times the base broadcast used the binary tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BCAST_BINOMIAL, "The number of times the base broadcast used the binomial algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_CHAIN, "The number of times the base reduce used the chain algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_PIPELINE, "The number of times the base reduce used the pipeline algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_BINARY, "The number of times the base reduce used the binary tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_BINOMIAL, "The number of times the base reduce used the binomial tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE, "The number of times the base reduce used the in order binary tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_LINEAR, "The number of times the base reduce used the basic linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING, "The number of times the base reduce scatter used the nonoverlapping algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING, "The number of times the base reduce scatter used the recursive halving algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_REDUCE_SCATTER_RING, "The number of times the base reduce scatter used the ring algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING, "The number of times the base allreduce used the nonoverlapping algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING, "The number of times the base allreduce used the recursive doubling algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLREDUCE_RING, "The number of times the base allreduce used the ring algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED, "The number of times the base allreduce used the segmented ring algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLREDUCE_LINEAR, "The number of times the base allreduce used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_SCATTER_BINOMIAL, "The number of times the base scatter used the binomial tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_SCATTER_LINEAR, "The number of times the base scatter used the linear algorithm."), + 
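The watermark descriptions above note that those counters reset each time they are read and require their companion counters to be active. A hedged MPI_T consumer sketch that locates and reads one of them follows; the full pvar name carries an MCA prefix, so it is matched as a substring rather than assumed, and a bin counter would return 'count' values per read instead of one:

#include <stdio.h>
#include <string.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int provided, num, i, index = -1;

    MPI_Init(&argc, &argv);
    MPI_T_init_thread(MPI_THREAD_SINGLE, &provided);

    /* Scan the pvars for the SPC counter of interest. */
    MPI_T_pvar_get_num(&num);
    for (i = 0; i < num; i++) {
        char name[256], desc[256];
        int name_len = sizeof(name), desc_len = sizeof(desc);
        int verbosity, var_class, bind, readonly, continuous, atomic;
        MPI_Datatype dtype;
        MPI_T_enum etype;

        if (MPI_SUCCESS != MPI_T_pvar_get_info(i, name, &name_len, &verbosity,
                                               &var_class, &dtype, &etype,
                                               desc, &desc_len, &bind,
                                               &readonly, &continuous, &atomic))
            continue;
        if (NULL != strstr(name, "OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE")) {
            index = i;
            break;
        }
    }

    if (index >= 0) {
        MPI_T_pvar_session session;
        MPI_T_pvar_handle handle;
        int count;
        long long value[8];   /* bin counters report 'count' values; this one reports 1 */

        MPI_T_pvar_session_create(&session);
        MPI_T_pvar_handle_alloc(session, index, NULL, &handle, &count);
        /* Continuous pvars may already be active; a start failure is harmless here. */
        (void)MPI_T_pvar_start(session, handle);

        /* ... run the communication of interest here ... */

        MPI_T_pvar_read(session, handle, value);
        printf("max unexpected-queue depth since last read: %lld\n", value[0]);

        MPI_T_pvar_handle_free(session, &handle);
        MPI_T_pvar_session_free(&session);
    }

    MPI_T_finalize();
    MPI_Finalize();
    return 0;
}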
SET_COUNTER_ARRAY(OMPI_SPC_BASE_GATHER_BINOMIAL, "The number of times the base gather used the binomial tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_GATHER_LINEAR_SYNC, "The number of times the base gather used the synchronous linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_GATHER_LINEAR, "The number of times the base gather used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_INPLACE, "The number of times the base alltoall used the in-place algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_PAIRWISE, "The number of times the base alltoall used the pairwise algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_BRUCK, "The number of times the base alltoall used the bruck algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC, "The number of times the base alltoall used the synchronous linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_TWO_PROCS, "The number of times the base alltoall used the two process algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLTOALL_LINEAR, "The number of times the base alltoall used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_BRUCK, "The number of times the base allgather used the bruck algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING, "The number of times the base allgather used the recursive doubling algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_RING, "The number of times the base allgather used the ring algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE, "The number of times the base allgather used the neighbor exchange algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_TWO_PROCS, "The number of times the base allgather used the two process algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_ALLGATHER_LINEAR, "The number of times the base allgather used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_DOUBLE_RING, "The number of times the base barrier used the double ring algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_RECURSIVE_DOUBLING, "The number of times the base barrier used the recursive doubling algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_BRUCK, "The number of times the base barrier used the bruck algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_TWO_PROCS, "The number of times the base barrier used the two process algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_LINEAR, "The number of times the base barrier used the linear algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_BASE_BARRIER_TREE, "The number of times the base barrier used the tree algorithm."), + SET_COUNTER_ARRAY(OMPI_SPC_P2P_MESSAGE_SIZE, "This is a bin counter with two subcounters. 
The first is messages that are less than or equal to mpi_spc_p2p_message_boundary bytes and the second is those that are larger than mpi_spc_p2p_message_boundary bytes."), + SET_COUNTER_ARRAY(OMPI_SPC_EAGER_MESSAGES, "The number of messages that fall within the eager size."), + SET_COUNTER_ARRAY(OMPI_SPC_NOT_EAGER_MESSAGES, "The number of messages that do not fall within the eager size."), + SET_COUNTER_ARRAY(OMPI_SPC_QUEUE_ALLOCATION, "The amount of memory allocated after runtime currently in use for temporary message queues like the unexpected message queue and the out of sequence message queue."), + SET_COUNTER_ARRAY(OMPI_SPC_MAX_QUEUE_ALLOCATION, "The maximum amount of memory allocated after runtime at one point for temporary message queues like the unexpected message queue and the out of sequence message queue. Note: The OMPI_SPC_QUEUE_ALLOCATION counter must also be activated."), + SET_COUNTER_ARRAY(OMPI_SPC_UNEXPECTED_QUEUE_DATA, "The amount of memory currently in use for the unexpected message queue."), + SET_COUNTER_ARRAY(OMPI_SPC_MAX_UNEXPECTED_QUEUE_DATA, "The maximum amount of memory in use for the unexpected message queue. Note: The OMPI_SPC_UNEXPECTED_QUEUE_DATA counter must also be activated."), + SET_COUNTER_ARRAY(OMPI_SPC_OOS_QUEUE_DATA, "The amount of memory currently in use for the out-of-sequence message queue."), + SET_COUNTER_ARRAY(OMPI_SPC_MAX_OOS_QUEUE_DATA, "The maximum amount of memory in use for the out-of-sequence message queue. Note: The OMPI_SPC_OOS_QUEUE_DATA counter must also be activated.") }; -/* An array of integer values to denote whether an event is activated (1) or not (0) */ -static uint32_t ompi_spc_attached_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; -/* An array of integer values to denote whether an event is timer-based (1) or not (0) */ +/* A bitmap to denote whether an event is activated (1) or not (0) + * This is not static beacuse it is needed in the recording macros + * for instrumentation. 
+ */ +OMPI_DECLSPEC uint32_t ompi_spc_attached_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; +/* A bitmap to denote whether an event is timer-based (1) or not (0) */ static uint32_t ompi_spc_timer_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; -/* An array of event structures to store the event data (name and value) */ -static ompi_spc_t *ompi_spc_events = NULL; +/* A bitmap to denote whether an event is bin-based (1) or not (0) */ +static uint32_t ompi_spc_bin_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; +/* A bitmap to denote whether an event is collective bin-based (1) or not (0) */ +static uint32_t ompi_spc_collective_bin_event[OMPI_SPC_NUM_COUNTERS / sizeof(uint32_t)] = { 0 }; + +/* A contiguous data structure for storing the counter values, rules, and bins */ +void *ompi_spc_events = NULL; +/* An array of offset structures for indexing into the ompi_spc_events data */ +static ompi_spc_offset_t ompi_spc_offsets[OMPI_SPC_NUM_COUNTERS] = {-1}; +/* A pointer into the ompi_spc_events data for the SPC values */ +static ompi_spc_value_t *ompi_spc_values = NULL; +/* ############################################################## + * ################## SPC Bitmap Functions ###################### + * ############################################################## + */ static inline void SET_SPC_BIT(uint32_t* array, int32_t pos) { assert(pos < OMPI_SPC_NUM_COUNTERS); array[pos / (8 * sizeof(uint32_t))] |= (1U << (pos % (8 * sizeof(uint32_t)))); } -static inline bool IS_SPC_BIT_SET(uint32_t* array, int32_t pos) +inline bool IS_SPC_BIT_SET(uint32_t* array, int32_t pos) { assert(pos < OMPI_SPC_NUM_COUNTERS); return !!(array[pos / (8 * sizeof(uint32_t))] & (1U << (pos % (8 * sizeof(uint32_t))))); @@ -170,6 +257,70 @@ static inline void CLEAR_SPC_BIT(uint32_t* array, int32_t pos) array[pos / (8 * sizeof(uint32_t))] &= ~(1U << (pos % (8 * sizeof(uint32_t)))); } +/* Registers all of the SPC MCA parameter variables. If SPCs are enabled, this is called from + * ompi_mpi_params.c in its registration function. + */ +int ompi_spc_register_params() +{ + ompi_mpi_spc_attach_string = NULL; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_attach", + "A comma delimited string listing the software-based performance counters (SPCs) to enable.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_attach_string); + + ompi_mpi_spc_xml_string = NULL; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_xml_string", + "A string to add to SPC XML files for easier identification.
The format will be: spc_data.[nodename].[jobid or spc_xml_string].[world_rank].xml", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_xml_string); + + ompi_mpi_spc_dump_enabled = false; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_dump_enabled", + "A boolean value for whether (true) or not (false) to enable dumping SPC counters in MPI_Finalize.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_dump_enabled); + + ompi_mpi_spc_mmap_enabled = false; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_mmap_enabled", + "A boolean value for whether (true) or not (false) to enable dumping SPC counters to an mmap'd file.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_mmap_enabled); + + ompi_mpi_spc_p2p_message_boundary = 12288; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_p2p_message_boundary", + "An integer value for determining the boundary for whether a message is small/large for point to point message size bin counter (<= this value is small).", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_p2p_message_boundary); + + ompi_mpi_spc_collective_message_boundary = 12288; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_collective_message_boundary", + "An integer value for determining the boundary for whether a message is small/large for collective bin counters (<= this value is small).", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_collective_message_boundary); + + ompi_mpi_spc_collective_comm_boundary = 64; + (void) mca_base_var_register("ompi", "mpi", NULL, "spc_collective_comm_boundary", + "An integer value for determining the boundary for whether a communicator is small/large for collective bin counters (<= this value is small).", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mpi_spc_collective_comm_boundary); + + return OMPI_SUCCESS; +} + /* ############################################################## * ################# Begin MPI_T Functions ###################### * ############################################################## @@ -188,85 +339,512 @@ static int ompi_spc_notify(mca_base_pvar_t *pvar, mca_base_pvar_event_t event, v index = (int)(uintptr_t)pvar->ctx; /* Convert from MPI_T pvar index to SPC index */ /* For this event, we need to set count to the number of long long type - * values for this counter. All SPC counters are one long long, so we - * always set count to 1. + * values for this counter. Most SPC counters are one long long so the + * default is 1, however bin counters and the xml string can be longer. 
*/ - if(MCA_BASE_PVAR_HANDLE_BIND == event) { - *count = 1; + do { + if(MCA_BASE_PVAR_HANDLE_BIND == event) { + /* Convert from MPI_T pvar index to SPC index */ + index = pvar->pvar_index - mpi_t_offset; + if(index < 0) { + char *shm_dir; + if(0 == access(SPC_SHM_DIR, W_OK)) { + shm_dir = SPC_SHM_DIR; + } else { + opal_show_help("help-mpi-runtime.txt", "spc: default shared memory directory failed", true, opal_process_info.job_session_dir); + shm_dir = opal_process_info.job_session_dir; + } + + int rank = ompi_comm_rank(ompi_spc_comm), rc; + char filename[SPC_MAX_FILENAME]; + + if(ompi_mpi_spc_xml_string == NULL) { + rc = snprintf(filename, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%d.%d.xml", shm_dir, + opal_process_info.nodename, OPAL_PROC_MY_NAME.jobid, rank); + } else { + rc = snprintf(filename, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%s.%d.xml", shm_dir, + opal_process_info.nodename, ompi_mpi_spc_xml_string, rank); + } + + *count = strlen(filename); + break; + } + if( IS_SPC_BIT_SET(ompi_spc_bin_event, index) || IS_SPC_BIT_SET(ompi_spc_collective_bin_event, index) ) { + *count = *(int*)(ompi_spc_events+ompi_spc_offsets[index].rules_offset); + } else { + *count = 1; + } + } + /* For this event, we need to turn on the counter */ + else if(MCA_BASE_PVAR_HANDLE_START == event) { + /* Convert from MPI_T pvar index to SPC index */ + index = pvar->pvar_index - mpi_t_offset; + if(index > 0) { + SET_SPC_BIT(ompi_spc_attached_event, index); + } + } + /* For this event, we need to turn off the counter */ + else if(MCA_BASE_PVAR_HANDLE_STOP == event) { + /* Convert from MPI_T pvar index to SPC index */ + index = pvar->pvar_index - mpi_t_offset; + if(index > 0) { + CLEAR_SPC_BIT(ompi_spc_attached_event, index); + } + } + } while(0); + + return MPI_SUCCESS; +} + +static int ompi_spc_get_xml_filename(const struct mca_base_pvar_t *pvar, void *value, void *obj_handle) + __opal_attribute_unused__; + +static int ompi_spc_get_xml_filename(const struct mca_base_pvar_t *pvar, void *value, void *obj_handle) +{ + int rc; + char **filename, *shm_dir; + + if(OPAL_LIKELY(!mpi_t_enabled)) { + filename = (char**)value; + rc = sprintf(*filename, ""); + + return MPI_SUCCESS; } - /* For this event, we need to turn on the counter */ - else if(MCA_BASE_PVAR_HANDLE_START == event) { - SET_SPC_BIT(ompi_spc_attached_event, index); + + if(0 == access(SPC_SHM_DIR, W_OK)) { + shm_dir = SPC_SHM_DIR; + } else { + opal_show_help("help-mpi-runtime.txt", "spc: default shared memory directory failed", true, opal_process_info.job_session_dir); + shm_dir = opal_process_info.job_session_dir; } - /* For this event, we need to turn off the counter */ - else if(MCA_BASE_PVAR_HANDLE_STOP == event) { - CLEAR_SPC_BIT(ompi_spc_attached_event, index); + + int rank = ompi_comm_rank(ompi_spc_comm); + + filename = (char**)value; + if(ompi_mpi_spc_xml_string == NULL) { + rc = sprintf(*filename, "%s" OPAL_PATH_SEP "spc_data.%s.%d.%d.xml", shm_dir, + opal_process_info.nodename, OPAL_PROC_MY_NAME.jobid, rank); + } else { + rc = sprintf(*filename, "%s" OPAL_PATH_SEP "spc_data.%s.%s.%d.xml", shm_dir, + opal_process_info.nodename, ompi_mpi_spc_xml_string, rank); } return MPI_SUCCESS; } -/* ############################################################## - * ################# Begin SPC Functions ######################## - * ############################################################## - */ - /* This function returns the current count of an SPC counter that has been retistered * as an MPI_T pvar. 
The MPI_T index is not necessarily the same as the SPC index, * so we need to convert from MPI_T index to SPC index and then set the 'value' argument - * to the correct value for this pvar. + * to the correct value for this pvar. Watermark counters are also reset here. */ static int ompi_spc_get_count(const struct mca_base_pvar_t *pvar, void *value, void *obj_handle) __opal_attribute_unused__; static int ompi_spc_get_count(const struct mca_base_pvar_t *pvar, void *value, void *obj_handle) { - long long *counter_value = (long long*)value; - if(OPAL_LIKELY(!mpi_t_enabled)) { + long long *counter_value = (long long*)value; *counter_value = 0; return MPI_SUCCESS; } /* Convert from MPI_T pvar index to SPC index */ - int index = (int)(uintptr_t)pvar->ctx; + int index = pvar->pvar_index - mpi_t_offset; + + /* If this is a bin-based counter, set 'value' to the array of bin values */ + if( IS_SPC_BIT_SET(ompi_spc_bin_event, index) || IS_SPC_BIT_SET(ompi_spc_collective_bin_event, index) ) { + long long *bin_value = (long long*)value; + int count = ((int*)(ompi_spc_events+ompi_spc_offsets[index].rules_offset))[0]; + for(int i = 0; i < count; i++) { + bin_value[i] = ((long long*)(ompi_spc_events+ompi_spc_offsets[index].bins_offset))[i]; + } + return MPI_SUCCESS; + } + + long long *counter_value = (long long*)value; /* Set the counter value to the current SPC value */ - *counter_value = (long long)ompi_spc_events[index].value; + *counter_value = ompi_spc_values[index]; + /* If this is a timer-based counter, convert from cycles to microseconds */ if( IS_SPC_BIT_SET(ompi_spc_timer_event, index) ) { *counter_value /= sys_clock_freq_mhz; } - /* If this is a high watermark counter, reset it after it has been read */ - if(index == OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE || index == OMPI_SPC_MAX_OOS_IN_QUEUE) { - ompi_spc_events[index].value = 0; + /* STEP 4: If this is a high watermark counter, reset it after it has been read. + * Be sure to reset the counter to the current value of the counter it + * is tracking. + */ + if(index == OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE) { + ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_UNEXPECTED_IN_QUEUE]; + } + if(index == OMPI_SPC_MAX_OOS_IN_QUEUE) { + ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_OOS_IN_QUEUE]; + } + if(index == OMPI_SPC_MAX_QUEUE_ALLOCATION) { + ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_QUEUE_ALLOCATION]; + } + if(index == OMPI_SPC_MAX_UNEXPECTED_QUEUE_DATA) { + ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_UNEXPECTED_QUEUE_DATA]; + } + if(index == OMPI_SPC_MAX_OOS_QUEUE_DATA) { + ompi_spc_values[index] = ompi_spc_values[OMPI_SPC_OOS_QUEUE_DATA]; } return MPI_SUCCESS; } +/* ############################################################## + * ################# Begin SPC Functions ######################## + * ############################################################## + */ + /* Initializes the events data structure and allocates memory for it if needed. 
*/ void ompi_spc_events_init(void) { - int i; + ompi_comm_dup(&ompi_mpi_comm_world.comm, &ompi_spc_comm); + + int i, value_offset = 0, bin_offset = OMPI_SPC_NUM_COUNTERS*sizeof(ompi_spc_value_t), rank = ompi_comm_rank(ompi_spc_comm), shm_fd, rc, ret, mod; + char filename[SPC_MAX_FILENAME], *shm_dir; + void *ptr; + + /* Make sure the bin offset is cache aligned to avoid false sharing */ + mod = bin_offset % SPC_CACHE_LINE; + if(mod != 0) { + bin_offset += SPC_CACHE_LINE - mod; + } + + FILE *fptr, *shm_fptr = NULL; + char sm_file[SPC_MAX_FILENAME], *my_segment; + opal_shmem_ds_t shm_ds; + + if(ompi_mpi_spc_mmap_enabled) { + /* Determine the location for saving the shared memory file */ + if(0 == access(SPC_SHM_DIR, W_OK)) { + shm_dir = SPC_SHM_DIR; + } else { + opal_show_help("help-mpi-runtime.txt", "spc: default shared memory directory failed", true, opal_process_info.job_session_dir); + shm_dir = opal_process_info.job_session_dir; + } + + /* Create a shared memory file */ + + rc = snprintf(sm_file, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%d.%d", shm_dir, + opal_process_info.nodename, OPAL_PROC_MY_NAME.jobid, rank); + + if (0 > rc) { + opal_show_help("help-mpi-runtime.txt", "spc: filename creation failure", true); + } + + if(ompi_mpi_spc_xml_string == NULL) { + rc = snprintf(filename, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%d.%d.xml", shm_dir, + opal_process_info.nodename, OPAL_PROC_MY_NAME.jobid, rank); + } else { + rc = snprintf(filename, SPC_MAX_FILENAME, "%s" OPAL_PATH_SEP "spc_data.%s.%s.%d.xml", shm_dir, + opal_process_info.nodename, ompi_mpi_spc_xml_string, rank); + } + if (0 > rc) { + opal_show_help("help-mpi-runtime.txt", "spc: filename creation failure", true); + } + fptr = fopen(filename, "w+"); + + /* Registers the name/path of the XML file as an MPI_T pvar */ + ret = mca_base_pvar_register("ompi", "runtime", "spc", "OMPI_SPC_XML_FILE", "The filename for the SPC XML file for using the mmap interface.", + OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_GENERIC, + MCA_BASE_VAR_TYPE_STRING, NULL, MPI_T_BIND_NO_OBJECT, + MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, + ompi_spc_get_xml_filename, NULL, ompi_spc_notify, NULL); + if(ret < 0) { + opal_output(0, "There was an error registering an MPI_T pvar -> %s\n", opal_strerror(ret)); + } + + + fprintf(fptr, "\n"); + fprintf(fptr, "\n"); + } + + /* STEP 3: Add specialized counter enumerations to the appropriate bitmap here. 
+ * + */ + /* ######################################################################## + * ################## Add Timer Based Counter Enums Here ################## + * ######################################################################## + */ + + SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_TIME); + SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_QUEUE_TIME); + SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_OOS_MATCH_TIME); + SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_OOS_MATCH_QUEUE_TIME); + + /* ############################################################################### + * ###################### Put Bin Counter Sizes Here ############################# + * ############################################################################### + */ + int data_size = OMPI_SPC_NUM_COUNTERS * sizeof(ompi_spc_value_t); + + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_P2P_MESSAGE_SIZE); + ompi_spc_offsets[OMPI_SPC_P2P_MESSAGE_SIZE].num_bins = 2; + data_size += 2 * (sizeof(int) + sizeof(ompi_spc_value_t)); + + /* ######################################################################## + * ############## Add Collective Bin-Based Counter Enums Here ############# + * ######################################################################## + */ + /* For each collective bin counter we must set the bitmap bit, allocate memory for the arrays and populate the bin_rules array */ + + /* Allgather Algorithms */ + /* Collective bin counter for the Bruck Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_BRUCK); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_BRUCK); + /* Collective bin counter for the Recursive Doubling Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING); + /* Collective bin counter for the Ring Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_RING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_RING); + /* Collective bin counter for the Neighbor Exchange Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE); + /* Collective bin counter for the Two Process Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_TWO_PROCS); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_TWO_PROCS); + /* Collective bin counter for the Linear Allgather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLGATHER_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLGATHER_LINEAR); + + /* Allreduce Algorithms */ + /* Collective bin counter for the Nonoverlapping Allreduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING); + /* Collective bin counter for the Recursive Doubling Allreduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING); + /* Collective bin counter for the Ring Allreduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLREDUCE_RING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLREDUCE_RING); + /* Collective bin counter for the Segmented Ring Allreduce algorithm */ + 
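Each algorithm flagged in these lists gets a four-way bin: ompi_spc_collective_bin_record() further down classifies every invocation by message size against mpi_spc_collective_message_boundary and by communicator size against mpi_spc_collective_comm_boundary. A standalone sketch of that 2x2 classification, using the default boundaries registered earlier (12288 bytes, 64 processes):

#include <stdio.h>

/* Toy classification mirroring ompi_spc_collective_bin_record():
 * bins[0] small message / small comm, bins[1] small message / large comm,
 * bins[2] large message / small comm, bins[3] large message / large comm. */
static int collective_bin(long long bytes, int procs,
                          long long msg_boundary, int comm_boundary)
{
    int small_message = (bytes <= msg_boundary);
    int small_comm    = (procs <= comm_boundary);
    return (small_message ? 0 : 2) + (small_comm ? 0 : 1);
}

int main(void)
{
    printf("%d\n", collective_bin(1024,   16, 12288, 64));  /* 0: small msg, small comm */
    printf("%d\n", collective_bin(65536, 256, 12288, 64));  /* 3: large msg, large comm */
    return 0;
}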
SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED); + /* Collective bin counter for the Linear Allreduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLREDUCE_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLREDUCE_LINEAR); - /* If the events data structure hasn't been allocated yet, allocate memory for it */ - if(NULL == ompi_spc_events) { - ompi_spc_events = (ompi_spc_t*)malloc(OMPI_SPC_NUM_COUNTERS * sizeof(ompi_spc_t)); - if(ompi_spc_events == NULL) { + /* All-to-All Algorithms */ + /* Collective bin counter for the Inplace Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_INPLACE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_INPLACE); + /* Collective bin counter for the Pairwise Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_PAIRWISE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_PAIRWISE); + /* Collective bin counter for the Bruck Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_BRUCK); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_BRUCK); + /* Collective bin counter for the Linear Sync Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC); + /* Collective bin counter for the Two Process Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_TWO_PROCS); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_TWO_PROCS); + /* Collective bin counter for the Linear Alltoall algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_ALLTOALL_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_ALLTOALL_LINEAR); + + /* Broadcast Algorithms */ + /* Collective bin counter for the Chain Broadcast algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_BCAST_CHAIN); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_BCAST_CHAIN); + /* Collective bin counter for the Binomial Broadcast algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_BCAST_BINOMIAL); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_BCAST_BINOMIAL); + /* Collective bin counter for the Pipeline Broadcast algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_BCAST_PIPELINE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_BCAST_PIPELINE); + /* Collective bin counter for the Split Binary Tree Broadcast algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_BCAST_SPLIT_BINTREE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_BCAST_SPLIT_BINTREE); + /* Collective bin counter for the Binary Tree Broadcast algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_BCAST_BINTREE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_BCAST_BINTREE); + + /* Gather Algorithms */ + /* Collective bin counter for the Binomial Gather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_GATHER_BINOMIAL); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_GATHER_BINOMIAL); + /* Collective bin counter for the Linear Sync Gather algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_GATHER_LINEAR_SYNC); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_GATHER_LINEAR_SYNC); + /* Collective bin counter for the Linear Gather algorithm */ + 
SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_GATHER_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_GATHER_LINEAR); + + /* Reduce Algorithms */ + /* Collective bin counter for the Chain Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_CHAIN); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_CHAIN); + /* Collective bin counter for the Pipeline Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_PIPELINE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_PIPELINE); + /* Collective bin counter for the Binary Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_BINARY); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_BINARY); + /* Collective bin counter for the Binomial Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_BINOMIAL); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_BINOMIAL); + /* Collective bin counter for the In Order Binary Tree Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE); + /* Collective bin counter for the Linear Reduce algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_LINEAR); + + /* Reduce Scatter Algorithms */ + /* Collective bin counter for the Nonoverlapping Reduce Scatter algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING); + /* Collective bin counter for the Recursive Halving Reduce Scatter algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING); + /* Collective bin counter for the Ring Reduce Scatter algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_RING); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_REDUCE_SCATTER_RING); + + /* Scatter Algorithms */ + /* Collective bin counter for the Binomial Scatter algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_SCATTER_BINOMIAL); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_SCATTER_BINOMIAL); + /* Collective bin counter for the Linear Scatter algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_SCATTER_LINEAR); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_SCATTER_LINEAR); + +/* Form for adding new collective bin counters */ +#if 0 + /* X Algorithms */ + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); + /* Collective bin counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); + /* Collective bin 
counter for the X algorithm */ + SET_SPC_BIT(ompi_spc_collective_bin_event, OMPI_SPC_BASE_); + SET_SPC_BIT(ompi_spc_bin_event, OMPI_SPC_BASE_); +#endif + + for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { + if(IS_SPC_BIT_SET(ompi_spc_collective_bin_event,i)) { + data_size += 4 * (sizeof(int) + sizeof(ompi_spc_value_t)); + ompi_spc_offsets[i].num_bins = 4; + } + } + + /* ############################################################################### + * ############################# MMAP Initialization ############################# + * ############################################################################### + */ + uint page_size = opal_getpagesize(); + int bytes_needed = page_size * ((data_size + page_size - 1) % page_size); + + if(ompi_mpi_spc_mmap_enabled) { + rc = opal_shmem_segment_create(&shm_ds, sm_file, bytes_needed); + if (OPAL_SUCCESS != rc) { + opal_show_help("help-mpi-runtime.txt", "spc: shm segment creation failure", true); + goto map_failed; + } + int default_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; + shm_fd = open(sm_file, O_RDWR | O_CREAT | O_NONBLOCK, default_permissions); + if(0 > shm_fd) { + opal_show_help("help-mpi-runtime.txt", "spc: shm file open failure", true, strerror(errno)); + goto map_failed; + } + + my_segment = opal_shmem_segment_attach(&shm_ds); + if(NULL == my_segment) { + opal_show_help("help-mpi-runtime.txt", "spc: shm attach failure", true); + goto map_failed; + } + } + + /* If the mmap fails, we can fall back to malloc to allocate the data. If malloc fails, then we can't + * continue and the counters will have to be disabled. + */ + if(!ompi_mpi_spc_mmap_enabled) { + goto map_failed; + } + if(MAP_FAILED == (ompi_spc_events = mmap(0, bytes_needed, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0))) { + opal_show_help("help-mpi-runtime.txt", "spc: mmap failure", true, strerror(errno)); + map_failed: + mmap_failed = true; + ompi_spc_events = NULL; + /* mmap failed, so try malloc */ + if(NULL == (ompi_spc_events = malloc(bytes_needed))) { opal_show_help("help-mpi-runtime.txt", "lib-call-fail", true, "malloc", __FILE__, __LINE__); + spc_enabled = false; return; + } else { + need_free = true; /* Since we malloc'd this data we will need to free it */ } } + + ompi_spc_values = (ompi_spc_value_t*)ompi_spc_events; + + /* Write the XML file header with the basic spc information. + * NOTE: If we failed to mmap the data file, there is no need to write the XML file. + */ + if(ompi_mpi_spc_mmap_enabled && !mmap_failed) { + fprintf(fptr, "\t%s\n", sm_file); + fprintf(fptr, "\t%d\n", OMPI_SPC_NUM_COUNTERS * sizeof(ompi_spc_t)); + fprintf(fptr, "\t%d\n", OMPI_SPC_NUM_COUNTERS); + fprintf(fptr, "\t%d\n", sys_clock_freq_mhz); + } + /* The data structure has been allocated, so we simply initialize all of the counters * with their names and an initial count of 0. 
*/ for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { - ompi_spc_events[i].name = (char*)ompi_spc_events_names[i].counter_name; - ompi_spc_events[i].value = 0; + ompi_spc_values[i] = 0; + + /* Add this counter to the XML document */ + if(ompi_mpi_spc_mmap_enabled && !mmap_failed) { + fprintf(fptr, "\t\n"); + fprintf(fptr, "\t\t%s\n", ompi_spc_events_names[i].counter_name); + fprintf(fptr, "\t\t%d\n", value_offset); + } + value_offset += sizeof(ompi_spc_value_t); + + if(ompi_spc_offsets[i].num_bins > 0) { + ompi_spc_offsets[i].rules_offset = bin_offset; + bin_offset += ompi_spc_offsets[i].num_bins*sizeof(int); + ompi_spc_offsets[i].bins_offset = bin_offset; + bin_offset += ompi_spc_offsets[i].num_bins*sizeof(ompi_spc_value_t); + + /* Make sure the bin offset is cache aligned to avoid false sharing */ + mod = bin_offset % SPC_CACHE_LINE; + if(mod != 0) { + bin_offset += SPC_CACHE_LINE - mod; + } + } else { + ompi_spc_offsets[i].rules_offset = -1; + ompi_spc_offsets[i].bins_offset = -1; + } + if(ompi_mpi_spc_mmap_enabled && !mmap_failed) { + fprintf(fptr, "\t\t%d\n", ompi_spc_offsets[i].rules_offset); + fprintf(fptr, "\t\t%d\n", ompi_spc_offsets[i].bins_offset); + fprintf(fptr, "\t\n"); + } } - ompi_comm_dup(&ompi_mpi_comm_world.comm, &ompi_spc_comm); + if(ompi_mpi_spc_mmap_enabled && !mmap_failed) { + fprintf(fptr, "\n"); + fclose(fptr); + } } /* Initializes the SPC data structures and registers all counters as MPI_T pvars. @@ -280,6 +858,9 @@ void ompi_spc_init(void) sys_clock_freq_mhz = opal_timer_base_get_freq() / 1000000; ompi_spc_events_init(); + if(!spc_enabled) { + return; + } /* Get the MCA params string of counters to turn on */ char **arg_strings = opal_argv_split(ompi_mpi_spc_attach_string, ','); @@ -292,11 +873,11 @@ void ompi_spc_init(void) if(strcmp(arg_strings[0], "all") == 0) { all_on = 1; } + } else if(0 == num_args) { + goto no_counters; } for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { - /* Reset all timer-based counters */ - CLEAR_SPC_BIT(ompi_spc_timer_event, i); matched = all_on; if( !matched ) { @@ -308,8 +889,7 @@ void ompi_spc_init(void) } } } - - if (matched) { + if( matched ) { SET_SPC_BIT(ompi_spc_attached_event, i); mpi_t_enabled = true; found++; @@ -317,20 +897,64 @@ void ompi_spc_init(void) /* Registers the current counter as an MPI_T pvar regardless of whether it's been turned on or not */ ret = mca_base_pvar_register("ompi", "runtime", "spc", ompi_spc_events_names[i].counter_name, ompi_spc_events_names[i].counter_description, - OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_SIZE, + OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_COUNTER, MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, NULL, MPI_T_BIND_NO_OBJECT, MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, - ompi_spc_get_count, NULL, ompi_spc_notify, (void*)(uintptr_t)i); - if( ret < 0 ) { + ompi_spc_get_count, NULL, ompi_spc_notify, NULL); + + /* Check to make sure that ret is a valid index and not an error code. 
+ */ + if( ret >= 0 ) { + if( mpi_t_offset == -1 ) { + mpi_t_offset = ret; + } + } + if( (ret < 0) || (all_on && (ret != (mpi_t_offset + found - 1))) ) { mpi_t_enabled = false; opal_show_help("help-mpi-runtime.txt", "spc: MPI_T disabled", true); break; } } - /* If this is a timer event, set the corresponding timer_event entry */ - SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_TIME); + /* ######################################################################## + * ###################### Initialize Bin Counters Here #################### + * ######################################################################## + */ + /* NOTE: These should be initialized even if they aren't currently turned on. + * This is just in case they are turned on mid-run through MPI_T. + */ + /* STEP 3a: Regular bin counters initialized here. */ + int *rules = NULL; + ompi_spc_value_t *bins = NULL; + + rules = (int*)(ompi_spc_events+ompi_spc_offsets[OMPI_SPC_P2P_MESSAGE_SIZE].rules_offset); + bins = (ompi_spc_value_t*)(ompi_spc_events+ompi_spc_offsets[OMPI_SPC_P2P_MESSAGE_SIZE].bins_offset); + + bins[0] = bins[1] = 0; + + rules[0] = 2; /* The number of bins */ + rules[1] = ompi_mpi_spc_p2p_message_boundary; /* The number after which counters go in the second bin */ + + /* Initialize Collective Bin Counters Here */ + int num_bins = 4; /* This can be expanded to be more flexible */ + + for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { + if(IS_SPC_BIT_SET(ompi_spc_collective_bin_event,i)) { + rules = (int*)(ompi_spc_events+ompi_spc_offsets[i].rules_offset); + bins = (ompi_spc_value_t*)(ompi_spc_events+ompi_spc_offsets[i].bins_offset); + + bins[0] = bins[1] = bins[2] = bins[3] = 0; + rules[0] = num_bins; /* The number of bins */ + rules[1] = ompi_mpi_spc_collective_message_boundary; /* The 'small message' break point */ + rules[2] = ompi_mpi_spc_collective_comm_boundary; /* The 'small communicator' break point */ + rules[3] = 0; /* Placeholder for now */ + + ompi_spc_offsets[i].num_bins = 4; + } + } + +no_counters: opal_argv_free(arg_strings); } @@ -339,8 +963,10 @@ void ompi_spc_init(void) */ static void ompi_spc_dump(void) { - int i, j, world_size, offset; + int i, j, k, world_size, offset, bin_offset; long long *recv_buffer = NULL, *send_buffer; + int *rules; + ompi_spc_value_t *bins; int rank = ompi_comm_rank(ompi_spc_comm); world_size = ompi_comm_size(ompi_spc_comm); @@ -348,30 +974,65 @@ static void ompi_spc_dump(void) /* Convert from cycles to usecs before sending */ for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { if( IS_SPC_BIT_SET(ompi_spc_timer_event, i) ) { - SPC_CYCLES_TO_USECS(&ompi_spc_events[i].value); + SPC_CYCLES_TO_USECS(&ompi_spc_values[i]); + } + } + + size_t buffer_size = OMPI_SPC_NUM_COUNTERS * sizeof(long long); + int buffer_len = OMPI_SPC_NUM_COUNTERS; + for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++){ + if(IS_SPC_BIT_SET(ompi_spc_bin_event, i)) { + /* Increment the buffer size enough to store the bin_rules and bins values */ + rules = (int*)(ompi_spc_events+ompi_spc_offsets[i].rules_offset); + buffer_size += rules[0] * 2 * sizeof(long long); + buffer_len += rules[0] * 2; } } /* Aggregate all of the information on rank 0 using MPI_Gather on MPI_COMM_WORLD */ - send_buffer = (long long*)malloc(OMPI_SPC_NUM_COUNTERS * sizeof(long long)); + send_buffer = (long long*)malloc(buffer_size); if (NULL == send_buffer) { opal_show_help("help-mpi-runtime.txt", "lib-call-fail", true, "malloc", __FILE__, __LINE__); return; } + bin_offset = OMPI_SPC_NUM_COUNTERS; for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { - 
send_buffer[i] = (long long)ompi_spc_events[i].value; + send_buffer[i] = (long long)ompi_spc_values[i]; + /* If this is a bin counter we need to append its arrays to the end of the send buffer */ + if(IS_SPC_BIT_SET(ompi_spc_bin_event, i)) { + rules = (int*)(ompi_spc_events+ompi_spc_offsets[i].rules_offset); + for(j = 0; j < rules[0]; j++) { + send_buffer[bin_offset] = (long long)rules[j]; + bin_offset++; + } + /* A flag to check if all bins are 0 */ + int is_empty = 1; + bins = (ompi_spc_value_t*)(ompi_spc_events+ompi_spc_offsets[i].bins_offset); + for(j = 0; j < rules[0]; j++) { + send_buffer[bin_offset] = (long long)bins[j]; + bin_offset++; + + if(bins[j] > 0) { + is_empty = 0; + } + } + /* Even if all bins are 0 we still send it for ease, even though it won't be printed */ + if(!is_empty && !IS_SPC_BIT_SET(ompi_spc_collective_bin_event, i)) { + send_buffer[i] = 1; + } + } } if( 0 == rank ) { - recv_buffer = (long long*)malloc(world_size * OMPI_SPC_NUM_COUNTERS * sizeof(long long)); + recv_buffer = (long long*)malloc(world_size * buffer_size); if (NULL == recv_buffer) { opal_show_help("help-mpi-runtime.txt", "lib-call-fail", true, "malloc", __FILE__, __LINE__); return; } } - (void)ompi_spc_comm->c_coll->coll_gather(send_buffer, OMPI_SPC_NUM_COUNTERS, MPI_LONG_LONG, - recv_buffer, OMPI_SPC_NUM_COUNTERS, MPI_LONG_LONG, + (void)ompi_spc_comm->c_coll->coll_gather(send_buffer, buffer_len, MPI_LONG_LONG, + recv_buffer, buffer_len, MPI_LONG_LONG, 0, ompi_spc_comm, ompi_spc_comm->c_coll->coll_gather_module); @@ -379,25 +1040,65 @@ static void ompi_spc_dump(void) if(rank == 0) { opal_output(0, "Open MPI Software-based Performance Counters:\n"); offset = 0; /* Offset into the recv_buffer for each rank */ + bin_offset = OMPI_SPC_NUM_COUNTERS; for(j = 0; j < world_size; j++) { opal_output(0, "MPI_COMM_WORLD Rank %d:\n", j); for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { - /* If this is a timer-based counter, we need to covert from cycles to usecs */ + /* Don't print counters with zero values */ if( 0 == recv_buffer[offset+i] ) { + if(IS_SPC_BIT_SET(ompi_spc_bin_event, i)) { + bin_offset += recv_buffer[bin_offset]*2; + } continue; } - opal_output(0, "%s -> %lld\n", ompi_spc_events[i].name, recv_buffer[offset+i]); + /* This is a non-zero bin counter */ + if(IS_SPC_BIT_SET(ompi_spc_bin_event, i)) { + /* This is a non-zero collective bin counter */ + if(IS_SPC_BIT_SET(ompi_spc_collective_bin_event, i)) { + opal_output(0, "%s -> %lld\n", ompi_spc_events_names[i].counter_name, recv_buffer[offset+i]); + + int num_bins = recv_buffer[bin_offset]; + int message_boundary = recv_buffer[bin_offset+1]; + int process_boundary = recv_buffer[bin_offset+2]; + + opal_output(0, "\tSmall Messages (<= %lld bytes)\n", message_boundary); + opal_output(0, "\t\tSmall Comm (<= %lld processes) -> %lld\n", process_boundary, recv_buffer[bin_offset+num_bins]); + opal_output(0, "\t\tLarge Comm (> %lld processes) -> %lld\n", process_boundary, recv_buffer[bin_offset+num_bins+1]); + opal_output(0, "\tLarge Messages (> %lld bytes)\n", message_boundary); + opal_output(0, "\t\tSmall Comm (<= %lld processes) -> %lld\n", process_boundary, recv_buffer[bin_offset+num_bins+2]); + opal_output(0, "\t\tLarge Comm (> %lld processes) -> %lld\n", process_boundary, recv_buffer[bin_offset+num_bins+3]); + + bin_offset += num_bins*2; + continue; + } + opal_output(0, "%s\n", ompi_spc_events_names[i].counter_name); + int num_bins = recv_buffer[bin_offset]; + for(k = 0; k < num_bins; k++){ + if(k == 0) { + opal_output(0, "\t-inf to %lld -> %lld\n", 
recv_buffer[bin_offset+1], recv_buffer[bin_offset+num_bins]); + } else if(k < num_bins-1) { + opal_output(0, "\t%lld to %lld -> %lld\n", recv_buffer[bin_offset+k]+1, recv_buffer[bin_offset+k+1], recv_buffer[bin_offset+num_bins+k]); + } else { + opal_output(0, "\t%lld to inf -> %lld\n", recv_buffer[bin_offset+k]+1, recv_buffer[bin_offset+num_bins+k]); + } + } + bin_offset += num_bins*2; + continue; + } + /* This is a non-zero normal counter */ + opal_output(0, "%s -> %lld\n", ompi_spc_events_names[i].counter_name, recv_buffer[offset+i]); } opal_output(0, "\n"); - offset += OMPI_SPC_NUM_COUNTERS; + offset += buffer_len; + bin_offset += OMPI_SPC_NUM_COUNTERS; } - printf("###########################################################################\n"); - printf("NOTE: Any counters not shown here were either disabled or had a value of 0.\n"); - printf("###########################################################################\n"); + opal_output(0, "###########################################################################\n"); + opal_output(0, "NOTE: Any counters not shown here were either disabled or had a value of 0.\n"); + opal_output(0, "###########################################################################\n"); - free(recv_buffer); + free(recv_buffer); recv_buffer = NULL; } - free(send_buffer); + free(send_buffer); send_buffer = NULL; ompi_spc_comm->c_coll->coll_barrier(ompi_spc_comm, ompi_spc_comm->c_coll->coll_barrier_module); } @@ -405,20 +1106,81 @@ static void ompi_spc_dump(void) /* Frees any dynamically alocated OMPI SPC data structures */ void ompi_spc_fini(void) { + int fd, rc; + char sm_file[SPC_MAX_FILENAME]; + char *shm_dir = SPC_SHM_DIR; + + int rank = ompi_comm_rank(ompi_spc_comm); + if (SPC_ENABLE == 1 && ompi_mpi_spc_dump_enabled) { ompi_spc_dump(); } - free(ompi_spc_events); ompi_spc_events = NULL; - ompi_comm_free(&ompi_spc_comm); + int i; + for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { + CLEAR_SPC_BIT(ompi_spc_attached_event, i); + CLEAR_SPC_BIT(ompi_spc_timer_event, i); + CLEAR_SPC_BIT(ompi_spc_bin_event, i); + CLEAR_SPC_BIT(ompi_spc_collective_bin_event, i); + } + if(need_free) { + free(ompi_spc_events); ompi_spc_events = NULL; + } + ompi_comm_free(&ompi_spc_comm); ompi_spc_comm = NULL; } /* Records an update to a counter using an atomic add operation. */ void ompi_spc_record(unsigned int event_id, ompi_spc_value_t value) { - /* Denoted unlikely because counters will often be turned off. */ - if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) { - OPAL_THREAD_ADD_FETCH_SIZE_T(&(ompi_spc_events[event_id].value), value); + OPAL_THREAD_ADD_FETCH_SIZE_T(&(ompi_spc_values[event_id]), value); +} + +/* Records an update to a bin counter using an atomic add operation. 
*/ +void ompi_spc_bin_record(unsigned int event_id, ompi_spc_value_t value) +{ + int *rules; + ompi_spc_value_t *bins; + + /* Update the total number of times this counter has been triggered */ + OPAL_THREAD_ADD_FETCH_SIZE_T(&(ompi_spc_values[event_id]), 1); + rules = (int*)(ompi_spc_events+ompi_spc_offsets[event_id].rules_offset); + bins = (ompi_spc_value_t*)(ompi_spc_events+ompi_spc_offsets[event_id].bins_offset); + + int i, num_bins = rules[0]; + /* Update the appropriate bin */ + for(i = 1; i < num_bins; i++) { + if(value <= rules[i]) { + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[i-1]), 1); + return; + } + } + /* This didn't fall within any of the other bins, so it must belong to the last bin */ + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[num_bins-1]), 1); +} + +/* Records an update to a collective bin counter using an atomic add operation. */ +void ompi_spc_collective_bin_record(unsigned int event_id, ompi_spc_value_t bytes, ompi_spc_value_t procs) +{ + int *rules; + ompi_spc_value_t *bins; + + rules = (int*)(ompi_spc_events+ompi_spc_offsets[event_id].rules_offset); + bins = (ompi_spc_value_t*)(ompi_spc_events+ompi_spc_offsets[event_id].bins_offset); + + uint small_message = (bytes <= rules[1]); + uint small_comm = (procs <= rules[2]); + /* Always update the total number of times this collective algorithm was called */ + OPAL_THREAD_ADD_FETCH_SIZE_T(&(ompi_spc_values[event_id]), 1); + + /* Update the appropriate bin for the message size and number of processes */ + if(small_message && small_comm) { + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[0]), 1); + } else if(small_message && !small_comm) { + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[1]), 1); + } else if(!small_message && small_comm) { + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[2]), 1); + } else { + OPAL_THREAD_ADD_FETCH_SIZE_T(&(bins[3]), 1); } } @@ -428,10 +1190,8 @@ void ompi_spc_record(unsigned int event_id, ompi_spc_value_t value) */ void ompi_spc_timer_start(unsigned int event_id, opal_timer_t *cycles) { - /* Check whether cycles == 0.0 to make sure the timer hasn't started yet. - * This is denoted unlikely because the counters will often be turned off. - */ - if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id) && *cycles == 0) ) { + /* Check whether cycles == 0.0 to make sure the timer hasn't started yet. */ + if( *cycles == 0 ) { *cycles = opal_timer_base_get_cycles(); } } @@ -442,11 +1202,8 @@ void ompi_spc_timer_start(unsigned int event_id, opal_timer_t *cycles) */ void ompi_spc_timer_stop(unsigned int event_id, opal_timer_t *cycles) { - /* This is denoted unlikely because the counters will often be turned off. */ - if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) { - *cycles = opal_timer_base_get_cycles() - *cycles; - OPAL_THREAD_ADD_FETCH_SIZE_T(&ompi_spc_events[event_id].value, (size_t) *cycles); - } + *cycles = opal_timer_base_get_cycles() - *cycles; + OPAL_THREAD_ADD_FETCH_SIZE_T(&ompi_spc_values[event_id], (ompi_spc_value_t) *cycles); } /* Checks a tag, and records the user version of the counter if it's greater @@ -460,21 +1217,33 @@ void ompi_spc_user_or_mpi(int tag, ompi_spc_value_t value, unsigned int user_enu /* Checks whether the counter denoted by value_enum exceeds the current value of the * counter denoted by watermark_enum, and if so sets the watermark_enum counter to the * value of the value_enum counter. + * + * WARNING: This assumes that this function was called while a lock has already been taken. + * This function is NOT thread safe otherwise! 
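+ * The watermark comparison and assignment below are a plain read-modify-write on
+ * ompi_spc_values[watermark_enum], so concurrent callers could overwrite each
+ * other's updates unless the caller serializes access.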
*/ -void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum) +void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum, ompi_spc_value_t value) { - /* Denoted unlikely because counters will often be turned off. */ - if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, watermark_enum) && - IS_SPC_BIT_SET(ompi_spc_attached_event, value_enum)) ) { - /* WARNING: This assumes that this function was called while a lock has already been taken. - * This function is NOT thread safe otherwise! - */ - if(ompi_spc_events[value_enum].value > ompi_spc_events[watermark_enum].value) { - ompi_spc_events[watermark_enum].value = ompi_spc_events[value_enum].value; + OPAL_THREAD_ADD_FETCH_SIZE_T(&(ompi_spc_values[value_enum]), value); + if(IS_SPC_BIT_SET(ompi_spc_attached_event, watermark_enum)) { + if(ompi_spc_values[value_enum] > ompi_spc_values[watermark_enum]) { + ompi_spc_values[watermark_enum] = ompi_spc_values[value_enum]; } } } +/* Gets the value of an SPC counter. + * + * WARNING: This function is not performed atomically and may not return the most up to date + * value in a threaded run. + */ +ompi_spc_value_t ompi_spc_get_value(unsigned int event_id) +{ + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) { + return ompi_spc_values[event_id]; + } + return 0; +} + /* Converts a counter value that is in cycles to microseconds. */ void ompi_spc_cycles_to_usecs(ompi_spc_value_t *cycles) diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h index 5d040511c34..e3a8f88979d 100644 --- a/ompi/runtime/ompi_spc.h +++ b/ompi/runtime/ompi_spc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 The University of Tennessee and The University + * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2018 Research Organization for Information Science @@ -18,7 +18,12 @@ #include #include #include +#include #include +#include +#include /* For mode constants */ +#include /* For O_* constants */ +#include #include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" @@ -28,26 +33,98 @@ #include "opal/util/argv.h" #include "opal/util/show_help.h" #include "opal/util/output.h" +#include "opal/mca/shmem/base/base.h" +#include "opal/util/sys_limits.h" + +#define SPC_CACHE_LINE opal_cache_line_size /* The number of bytes in a cache line. */ +#define SPC_MAX_FILENAME PATH_MAX /* The maximum length allowed for the spc file strings */ +#define SPC_SHM_DIR "/dev/shm" /* The default directory for shared memory files */ #include MCA_timer_IMPLEMENTATION_HEADER +/* MCA Parameter Variables */ +/** + * A comma delimited list of SPC counters to turn on or 'attach'. To turn + * all counters on, the string can be simply "all". An empty string will + * keep all counters turned off. + */ +OMPI_DECLSPEC extern char * ompi_mpi_spc_attach_string; + +/** + * A string to append to the SPC XML files for using the mmap interface. + * This is to make the filename easier to identify. + */ +OMPI_DECLSPEC extern char * ompi_mpi_spc_xml_string; + +/** + * A boolean value that determines whether or not to dump the SPC counter + * values in MPI_Finalize. A value of true dumps the counters and false does not. + */ +OMPI_DECLSPEC extern bool ompi_mpi_spc_dump_enabled; + +/** + * A boolean value that determines whether or not to dump the SPC counter + * values in an mmap'd file during execution. A value of true dumps the + * counters and false does not. 
+ */ +OMPI_DECLSPEC extern bool ompi_mpi_spc_mmap_enabled; + +/** + * An integer value that denotes the time period between snapshots with the + * SPC mmap interface. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_snapshot_period; + +/** + * An integer value that denotes the boundary at which a message is qualified + * as a small/large message for the point to point message counter. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_p2p_message_boundary; + +/** + * An integer value that denotes the boundary at which a message is qualified + * as a small/large message for collective bin counters. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_collective_message_boundary; + +/** + * An integer value that denotes the boundary at which a communicator is qualified + * as a small/large communicator for collective bin counters. + */ +OMPI_DECLSPEC extern int ompi_mpi_spc_collective_comm_boundary; + /* INSTRUCTIONS FOR ADDING COUNTERS * 1.) Add a new counter name in the ompi_spc_counters_t enum before * OMPI_SPC_NUM_COUNTERS below. - * 2.) Add corresponding counter name and descriptions to the - * counter_names and counter_descriptions arrays in + * 2.) Add corresponding counter description(s) to the + * ompi_spc_events_names definition in * ompi_spc.c NOTE: The names and descriptions * MUST be in the same array location as where you added the * counter name in step 1. - * 3.) If this counter is based on a timer, add its enum name to - * the logic for timer-based counters in the ompi_spc_init - * function in ompi_spc.c - * 4.) Instrument the Open MPI code base where it makes sense for - * your counter to be modified using the SPC_RECORD macro. + * Search For: 'STEP 2' + * 3.) If this counter is a specialized counter like a timer, + * bin, or collective bin add its enum name to the logic for + * specialized counters in the ompi_spc_init function in ompi_spc.c + * Search For: 'STEP 3' + * NOTE: If this is a bin counter, and not a collective bin counter, + * you will need to initialize it. Search for: 'STEP 3a' + * 4.) If this counter is a watermark counter, additional logic is required + * for when this counter is read. The standard behavior of watermark + * counters is to keep track of updates to another counter and increase + * when that tracked counter exceeds the current value of the high + * watermark. These counters are reset to the current value of the + * tracked counter whenever they are read through MPI_T in the + * ompi_spc_get_count function in ompi_spc.c + * Search For: 'STEP 4' + * 5.) Instrument the Open MPI code where it makes sense for + * your counter to be modified using the appropriate SPC macro. + * This will typically be SPC_RECORD, but could be SPC_BIN_RECORD, + * SPC_COLL_BIN_RECORD, SPC_UPDATE_WATERMARK, or SPC_TIMER_START/STOP + * depending on the counter. * Note: If your counter is timer-based you should use the * SPC_TIMER_START and SPC_TIMER_STOP macros to record * the time in cycles to then be converted to microseconds later - * in the ompi_spc_get_count function when requested by MPI_T + * in the ompi_spc_get_count function when requested by MPI_T. 
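+ *
+ * Illustrative call sketches (hypothetical call sites and arguments; pick the
+ * macro that matches your counter type):
+ *     SPC_RECORD(OMPI_SPC_ISEND, 1);
+ *     SPC_BIN_RECORD(OMPI_SPC_P2P_MESSAGE_SIZE, message_size_in_bytes);
+ *     SPC_COLL_BIN_RECORD(OMPI_SPC_BASE_BCAST_BINOMIAL, bytes, communicator_size);
+ *     SPC_UPDATE_WATERMARK(OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, OMPI_SPC_UNEXPECTED_IN_QUEUE, 1); /* caller holds the relevant lock */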
*/ /* This enumeration serves as event ids for the various events */ @@ -155,25 +232,95 @@ typedef enum ompi_spc_counters { OMPI_SPC_BYTES_GET, OMPI_SPC_UNEXPECTED, OMPI_SPC_OUT_OF_SEQUENCE, + OMPI_SPC_OOS_QUEUE_HOPS, OMPI_SPC_MATCH_TIME, + OMPI_SPC_MATCH_QUEUE_TIME, + OMPI_SPC_OOS_MATCH_TIME, + OMPI_SPC_OOS_MATCH_QUEUE_TIME, OMPI_SPC_UNEXPECTED_IN_QUEUE, OMPI_SPC_OOS_IN_QUEUE, OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE, OMPI_SPC_MAX_OOS_IN_QUEUE, + OMPI_SPC_BASE_BCAST_LINEAR, + OMPI_SPC_BASE_BCAST_CHAIN, + OMPI_SPC_BASE_BCAST_PIPELINE, + OMPI_SPC_BASE_BCAST_SPLIT_BINTREE, + OMPI_SPC_BASE_BCAST_BINTREE, + OMPI_SPC_BASE_BCAST_BINOMIAL, + OMPI_SPC_BASE_REDUCE_CHAIN, + OMPI_SPC_BASE_REDUCE_PIPELINE, + OMPI_SPC_BASE_REDUCE_BINARY, + OMPI_SPC_BASE_REDUCE_BINOMIAL, + OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE, + OMPI_SPC_BASE_REDUCE_LINEAR, + OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING, + OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING, + OMPI_SPC_BASE_REDUCE_SCATTER_RING, + OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING, + OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING, + OMPI_SPC_BASE_ALLREDUCE_RING, + OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED, + OMPI_SPC_BASE_ALLREDUCE_LINEAR, + OMPI_SPC_BASE_SCATTER_BINOMIAL, + OMPI_SPC_BASE_SCATTER_LINEAR, + OMPI_SPC_BASE_GATHER_BINOMIAL, + OMPI_SPC_BASE_GATHER_LINEAR_SYNC, + OMPI_SPC_BASE_GATHER_LINEAR, + OMPI_SPC_BASE_ALLTOALL_INPLACE, + OMPI_SPC_BASE_ALLTOALL_PAIRWISE, + OMPI_SPC_BASE_ALLTOALL_BRUCK, + OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC, + OMPI_SPC_BASE_ALLTOALL_TWO_PROCS, + OMPI_SPC_BASE_ALLTOALL_LINEAR, + OMPI_SPC_BASE_ALLGATHER_BRUCK, + OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING, + OMPI_SPC_BASE_ALLGATHER_RING, + OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE, + OMPI_SPC_BASE_ALLGATHER_TWO_PROCS, + OMPI_SPC_BASE_ALLGATHER_LINEAR, + OMPI_SPC_BASE_BARRIER_DOUBLE_RING, + OMPI_SPC_BASE_BARRIER_RECURSIVE_DOUBLING, + OMPI_SPC_BASE_BARRIER_BRUCK, + OMPI_SPC_BASE_BARRIER_TWO_PROCS, + OMPI_SPC_BASE_BARRIER_LINEAR, + OMPI_SPC_BASE_BARRIER_TREE, + OMPI_SPC_P2P_MESSAGE_SIZE, + OMPI_SPC_EAGER_MESSAGES, + OMPI_SPC_NOT_EAGER_MESSAGES, + OMPI_SPC_QUEUE_ALLOCATION, + OMPI_SPC_MAX_QUEUE_ALLOCATION, + OMPI_SPC_UNEXPECTED_QUEUE_DATA, + OMPI_SPC_MAX_UNEXPECTED_QUEUE_DATA, + OMPI_SPC_OOS_QUEUE_DATA, + OMPI_SPC_MAX_OOS_QUEUE_DATA, OMPI_SPC_NUM_COUNTERS /* This serves as the number of counters. It must be last. */ } ompi_spc_counters_t; +extern uint32_t ompi_spc_attached_event[]; + /* There is currently no support for atomics on long long values so we will default to * size_t for now until support for such atomics is implemented. 
*/ typedef opal_atomic_size_t ompi_spc_value_t; /* A structure for storing the event data */ -typedef struct ompi_spc_s{ +typedef struct ompi_spc_s { char *name; ompi_spc_value_t value; + int *bin_rules; /* The first element is the number of bins, the rest represent when each bin starts */ + ompi_spc_value_t *bins; } ompi_spc_t; +/* A structure for indexing into the event data */ +typedef struct ompi_spc_offset_s { + int num_bins; + int rules_offset; + int bins_offset; +} ompi_spc_offset_t; + +/* MCA Parameters Initialization Function */ +int ompi_spc_register_params(void); + /* Events data structure initialization function */ void ompi_spc_events_init(void); @@ -181,11 +328,15 @@ void ompi_spc_events_init(void); void ompi_spc_init(void); void ompi_spc_fini(void); void ompi_spc_record(unsigned int event_id, ompi_spc_value_t value); +void ompi_spc_bin_record(unsigned int event_id, ompi_spc_value_t value); +void ompi_spc_collective_bin_record(unsigned int event_id, ompi_spc_value_t bytes, ompi_spc_value_t procs); void ompi_spc_timer_start(unsigned int event_id, opal_timer_t *cycles); void ompi_spc_timer_stop(unsigned int event_id, opal_timer_t *cycles); void ompi_spc_user_or_mpi(int tag, ompi_spc_value_t value, unsigned int user_enum, unsigned int mpi_enum); void ompi_spc_cycles_to_usecs(ompi_spc_value_t *cycles); -void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum); +void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum, ompi_spc_value_t value); +ompi_spc_value_t ompi_spc_get_value(unsigned int event_id); +bool IS_SPC_BIT_SET(uint32_t* array, int32_t pos); /* Macros for using the SPC utility functions throughout the codebase. * If SPC_ENABLE is not 1, the macros become no-ops. @@ -199,22 +350,41 @@ void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_e ompi_spc_fini() #define SPC_RECORD(event_id, value) \ - ompi_spc_record(event_id, value) + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) \ + ompi_spc_record(event_id, value) + +#define SPC_BIN_RECORD(event_id, value) \ + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) \ + ompi_spc_bin_record(event_id, value) + +#define SPC_COLL_BIN_RECORD(event_id, bytes, procs) \ + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) \ + ompi_spc_collective_bin_record(event_id, bytes, procs) #define SPC_TIMER_START(event_id, usec) \ - ompi_spc_timer_start(event_id, usec) + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) \ + ompi_spc_timer_start(event_id, usec) #define SPC_TIMER_STOP(event_id, usec) \ - ompi_spc_timer_stop(event_id, usec) + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, event_id)) ) \ + ompi_spc_timer_stop(event_id, usec) #define SPC_USER_OR_MPI(tag, value, enum_if_user, enum_if_mpi) \ - ompi_spc_user_or_mpi(tag, value, enum_if_user, enum_if_mpi) + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, enum_if_user) || IS_SPC_BIT_SET(ompi_spc_attached_event, enum_if_mpi)) ) \ + ompi_spc_user_or_mpi(tag, value, enum_if_user, enum_if_mpi) #define SPC_CYCLES_TO_USECS(cycles) \ ompi_spc_cycles_to_usecs(cycles) -#define SPC_UPDATE_WATERMARK(watermark_enum, value_enum) \ - ompi_spc_update_watermark(watermark_enum, value_enum) +/* WARNING: This macro assumes that it was called while a lock has already been taken. + * This function is NOT thread safe otherwise! 
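+ * Note that only value_enum's attach bit is checked by the macro below;
+ * ompi_spc_update_watermark itself checks watermark_enum's bit before comparing
+ * and (non-atomically) updating the watermark value.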
+ */ +#define SPC_UPDATE_WATERMARK(watermark_enum, value_enum, value) \ + if( OPAL_UNLIKELY(IS_SPC_BIT_SET(ompi_spc_attached_event, value_enum)) ) \ + ompi_spc_update_watermark(watermark_enum, value_enum, value) + +#define SPC_GET(event_id) \ + ompi_spc_get_value(event_id) #else /* SPCs are not enabled */ @@ -227,6 +397,12 @@ void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_e #define SPC_RECORD(event_id, value) \ ((void)0) +#define SPC_BIN_RECORD(event_id, value) \ + ((void)0) + +#define SPC_COLL_BIN_RECORD(event_id, bytes, procs) \ + ((void)0) + #define SPC_TIMER_START(event_id, usec) \ ((void)0) @@ -239,7 +415,10 @@ void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_e #define SPC_CYCLES_TO_USECS(cycles) \ ((void)0) -#define SPC_UPDATE_WATERMARK(watermark_enum, value_enum) \ +#define SPC_UPDATE_WATERMARK(watermark_enum, value_enum, value) \ + ((void)0) + +#define SPC_GET(event_id) \ ((void)0) #endif diff --git a/ompi/runtime/ompi_spc_documentation.md b/ompi/runtime/ompi_spc_documentation.md new file mode 100644 index 00000000000..7088c364de5 --- /dev/null +++ b/ompi/runtime/ompi_spc_documentation.md @@ -0,0 +1,831 @@ +## What Are Software Performance Counters? +Software Performance Counters (SPCs) are modelled after [PAPI's](http://icl.utk.edu/papi/) hardware performance counters, but are designed to keep track of information that originates from software rather than hardware. The basic idea is to add instrumentation to a software package to keep track of performance metrics of interest and provide access to them in a portable way. The implementation of SPCs within Open MPI exposes information about the internal operation of Open MPI that would otherwise not be available to users and tool developers. These counters have been integrated with the MPI Tool Information Interface (MPI_T) as performance variables, or pvars, to make this information more readily available to tool developers and users. These counters can also be accessed through an mmap-based interface at runtime. + +## Building OMPI With SPC Support +By default, Open MPI does not build with SPCs enabled, so all of the instrumentation code becomes no-ops. Building Open MPI with SPCs is as simple as adding `--enable-spc` to your configure line. Once SPCs have been built in, there are several MCA parameters that are used to manage SPCs at runtime: + +- mpi_spc_attach: A string used to turn specific counters on. There are two reserved strings, 'all' and 'none', for turning on all and none of the counters respectively. Otherwise, this should be a comma-separated list of counters to turn on using the counters names. The default value is 'none'. +- mpi_spc_dump_enabled: A boolean parameter denoted by a 'true' or 'false' string that determines whether or not to print the counter values to stdout during MPI_Finalize. The default value is 'false' +- mpi_spc_mmap_enabled: A boolean parameter denoted by a 'true' or 'false' string that determines whether or not to use the mmap interface for storing the SPC data. The default value is 'false' +- mpi_spc_xml_string: A string that is appended to the XML file created by the mmap interface for easy identification. For example, if this variable is set to the value of 'test', the resultant XML filename would be spc_data.[nodename].test.[world rank].xml. By default this string is empty, which results in the 'test' value being replaced by the Open MPI jobid. 
+- mpi_spc_p2p_message boundary: An integer value denoting the point after which messages are determined to be 'large' messages within OMPI_SPC_P2P_MESSAGE_SIZE bin counter. The default value is 12288. +- mpi_spc_collective_message_boundary: An integer value denoting the point after which messages are determined to be 'large' messages for collective bin counters. The default value is 12288. +- mpi_spc_collective_comm_boundary: An integer value denoting the point after which a communicator is determined to be 'large' for collective bin counters. The default value is 64. + +Setting these MCA parameters can be done via the command line like so: + +`mpirun -np X --mca mpi_spc_attach OMPI_SPC_SEND,OMPI_SPC_RECV --mca mpi_spc_dump_enabled true ./your_app` + +These MCA parameters can also be set inside an mca-params.conf file like so: + +```bash +mpi_spc_attach = all +mpi_spc_dump_enabled = true +mpi_spc_mmap_enabled = true +mpi_spc_xml_string = myXMLstring +``` + +## Using SPCs Through MPI_T +All of the SPC counters are registered with MPI_T as pvars, which means that they can be easily accessed by tools. It is worth noting that if any of the SPCs fail to register with MPI_T, all counters are turned off with respect to MPI_T. This is a design decision to create a fast translation between MPI_T indices and SPC indices. + +The following is a simple example C program that will show how these counters could be used through MPI_T in practice. Essentially, this example sends some number of messages of some size both specified by the user from process rank 0 to rank 1. Rank 0 uses an MPI_T pvar to report the number of times the binomial algorithm was used for a broadcast under different conditions Small/Large communicator/message size, and rank 1 registers an MPI_T pvar to determine the number of bytes received through point to point communications. This example can also be found in the examples directory of the Open MPI repository. + +```c +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Simple example usage of SPCs through MPI_T. + */ + +#include +#include +#include +#include + +#include "mpi.h" + +/* Sends 'num_messages' messages of 'message_size' bytes from rank 0 to rank 1. + * All messages are send synchronously and with the same tag in MPI_COMM_WORLD. + */ +void message_exchange(int num_messages, int message_size) +{ + int i, rank; + /* Use calloc to initialize data to 0's */ + char *data = (char*)calloc(message_size, sizeof(char)); + MPI_Status status; + MPI_Request req; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + /* This is designed to have at least num_messages unexpected messages in order to + * hit the unexpected message queue counters. The broadcasts are here to showcase + * the collective bin counters and the P2P and Eager message counters. 
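+ * Rank 1 posts its receives for tag 123 only after it has received the tag-321
+ * message, which rank 0 sends after issuing all of the tag-123 MPI_Isend calls,
+ * so the tag-123 messages typically land in the unexpected message queue.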
+ */ + if(rank == 0) { + for(i = 0; i < num_messages; i++) { + MPI_Isend(data, message_size, MPI_BYTE, 1, 123, MPI_COMM_WORLD, &req); + } + MPI_Send(data, message_size, MPI_BYTE, 1, 321, MPI_COMM_WORLD); + for(i = 0; i < num_messages; i++) { + MPI_Bcast(data, message_size, MPI_BYTE, 0, MPI_COMM_WORLD); + } + } else if(rank == 1) { + MPI_Recv(data, message_size, MPI_BYTE, 0, 321, MPI_COMM_WORLD, &status); + for(i = 0; i < num_messages; i++) { + MPI_Recv(data, message_size, MPI_BYTE, 0, 123, MPI_COMM_WORLD, &status); + } + for(i = 0; i < num_messages; i++) { + MPI_Bcast(data, message_size, MPI_BYTE, 0, MPI_COMM_WORLD); + } + } + /* This should use the binomial algorithm so it has at least one counter value */ + MPI_Bcast(data, 1, MPI_BYTE, 0, MPI_COMM_WORLD); + + free(data); +} + +int main(int argc, char **argv) +{ + int num_messages, message_size, rc; + + if(argc < 3) { + printf("Usage: mpirun -np 2 --mca mpi_spc_attach all --mca mpi_spc_dump_enabled true ./spc_example [num_messages] [message_size]\n"); + return -1; + } else { + num_messages = atoi(argv[1]); + message_size = atoi(argv[2]); + if(message_size <= 0) { + printf("Message size must be positive.\n"); + return -1; + } + } + + int i, j, rank, size, provided, num, name_len, desc_len, verbosity, bind, var_class, readonly, continuous, atomic, count, index, xml_index; + MPI_Datatype datatype; + MPI_T_enum enumtype; + MPI_Comm comm; + char name[256], description[256]; + + /* Counter names to be read by ranks 0 and 1 */ + char *counter_names[] = {"runtime_spc_OMPI_SPC_BASE_BCAST_BINOMIAL", + "runtime_spc_OMPI_SPC_BYTES_RECEIVED_USER" }; + char *xml_counter = "runtime_spc_OMPI_SPC_XML_FILE"; + + MPI_Init(NULL, NULL); + MPI_T_init_thread(MPI_THREAD_SINGLE, &provided); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + if(size != 2) { + fprintf(stderr, "ERROR: This test should be run with two MPI processes.\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + + if(rank == 0) { + printf("##################################################################\n"); + printf("This test is designed to highlight several different SPC counters.\n"); + printf("The MPI workload of this test will use 1 MPI_Send and %d MPI_Isend\n", num_messages); + printf("operation(s) on the sender side (rank 0) and %d MPI_Recv operation(s)\n", num_messages+1); + printf("on the receiver side (rank 1) in such a way that at least %d message(s)\n", num_messages); + printf("are unexpected. This highlights the unexpected message queue SPCs.\n"); + printf("There will also be %d MPI_Bcast operation(s) with one of them being of\n", num_messages+1); + printf("size 1 byte, and %d being of size %d byte(s). The 1 byte MPI_Bcast is\n", num_messages, message_size); + printf("meant to ensure that there is at least one MPI_Bcast that uses the\n"); + printf("binomial algorithm so the MPI_T pvar isn't all 0's. 
The addition of\n"); + printf("the broadcasts also has the effect of showcasing the P2P message size,\n"); + printf("eager vs not eager message, and bytes sent by the user vs MPI SPCs.\n"); + printf("Be sure to set the mpi_spc_dump_enabled MCA parameter to true in order\n"); + printf("to see all of the tracked SPCs.\n"); + printf("##################################################################\n\n"); + } + MPI_Barrier(MPI_COMM_WORLD); + + /* Determine the MPI_T pvar indices for the requested SPCs */ + index = xml_index = -1; + MPI_T_pvar_get_num(&num); + for(i = 0; i < num; i++) { + name_len = desc_len = 256; + rc = PMPI_T_pvar_get_info(i, name, &name_len, &verbosity, + &var_class, &datatype, &enumtype, description, &desc_len, &bind, + &readonly, &continuous, &atomic); + if( MPI_SUCCESS != rc ) + continue; + + if(strcmp(name, xml_counter) == 0) { + xml_index = i; + printf("[%d] %s -> %s\n", rank, name, description); + } + if(strcmp(name, counter_names[rank]) == 0) { + index = i; + printf("[%d] %s -> %s\n", rank, name, description); + } + } + + /* Make sure we found the counters */ + if(index == -1 || xml_index == -1) { + fprintf(stderr, "ERROR: Couldn't find the appropriate SPC counter in the MPI_T pvars.\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + + int ret, xml_count; + long long *values = NULL; + char *xml_filename = (char*)malloc(128 * sizeof(char)); + + MPI_T_pvar_session session; + MPI_T_pvar_handle handle; + /* Create the MPI_T sessions/handles for the counters and start the counters */ + ret = MPI_T_pvar_session_create(&session); + ret = MPI_T_pvar_handle_alloc(session, index, NULL, &handle, &count); + ret = MPI_T_pvar_start(session, handle); + + values = (long long*)malloc(count * sizeof(long long)); + + MPI_T_pvar_session xml_session; + MPI_T_pvar_handle xml_handle; + if(xml_index >= 0) { + ret = MPI_T_pvar_session_create(&xml_session); + ret = MPI_T_pvar_handle_alloc(xml_session, xml_index, NULL, &xml_handle, &xml_count); + ret = MPI_T_pvar_start(xml_session, xml_handle); + } + + double timer = MPI_Wtime(); + message_exchange(num_messages, message_size); + timer = MPI_Wtime() - timer; + + printf("[%d] Elapsed time: %lf seconds\n", rank, timer); + + ret = MPI_T_pvar_read(session, handle, values); + if(xml_index >= 0) { + ret = MPI_T_pvar_read(xml_session, xml_handle, &xml_filename); + } + + /* Print the counter values in order by rank */ + for(i = 0; i < 2; i++) { + printf("\n"); + if(i == rank) { + if(xml_index >= 0) { + printf("[%d] XML Counter Value Read: %s\n", rank, xml_filename); + } + for(j = 0; j < count; j++) { + printf("[%d] %s Counter Value Read: %lld\n", rank, counter_names[rank], values[j]); + } + fflush(stdout); + } + MPI_Barrier(MPI_COMM_WORLD); + } + /* Stop the MPI_T session, free the handle, and then free the session */ + ret = MPI_T_pvar_stop(session, handle); + ret = MPI_T_pvar_handle_free(session, &handle); + ret = MPI_T_pvar_session_free(&session); + + MPI_T_finalize(); + MPI_Finalize(); + + return 0; +} +``` + +### Using SPCs Through the mmap Interface + +There is another interface other than MPI_T for accessing SPCs. In order to use this interface, you need to set the MCA parameter `ompi_mpi_spc_mmap_enabled` to true. This method uses mmap to create a memory region in which you can directly access SPCs after using mmap on a certain file. 
All the information necessary to attach to the correct file is dumped in an XML file for each rank in a shared system location (the default is /dev/shm) with the name `spc_data.[nodename].[SPC XML String or Open MPI jobid].[world rank].xml`. This XML file will have the format shown in the example below with the filename and file size for performing the mmap, and the number of counters and the clock frequency in MHz for ease of parsing (the clock frequency is needed for converting timer counters to microseconds from cycles). Each counter tag will have a name, and three offsets. The three offsets denote how many bytes that information is offset into the counter data. All counters have a value field, but only bin counters have an offset for the rules and bins (If the counter doesn't have bins, the values in these fields will be -1). + +#### XML File Example +```xml + + + /dev/shm/spc_data.c00.-860356607.0 + 5248 + 164 + 2127 + + OMPI_SPC_SEND + 0 + -1 + -1 + + + OMPI_SPC_BSEND + 8 + -1 + -1 + + + ... + + ... + +``` + +#### mmap Interface Usage in C +```c +/* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Simple example usage of SPCs through an mmap'd file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* This structure will help us store all of the offsets for each + * counter that we want to print out. + */ +typedef struct spc_s { + char name[128]; + int offset; + int rules_offset; + int bins_offset; +} spc_t; + +int main(int argc, char **argv) +{ + if(argc < 4) { + printf("Usage: ./spc_mmap_test [num_messages] [message_size] [XML string]\n"); + return -1; + } + + MPI_Init(NULL, NULL); + + int i, num_messages = atoi(argv[1]), message_size = atoi(argv[2]), rank, shm_fd; + char *buf = (char*)malloc(message_size * sizeof(char)); + + MPI_Request *requests = (MPI_Request*)malloc(num_messages * sizeof(MPI_Request)); + MPI_Status *statuses = (MPI_Status*)malloc(num_messages * sizeof(MPI_Status)); + MPI_Status status; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int retval, shm_file_size, num_counters, freq_mhz; + long long value; + char filename[128], shm_filename[128], line[128], *token; + + char hostname[128]; + gethostname(hostname, 128); + + char *nodename; + nodename = strtok(hostname, "."); + + char *xml_string = argv[3]; + snprintf(filename, 128, "/dev/shm/spc_data.%s.%s.%d.xml", nodename, xml_string, rank); + + FILE *fptr = NULL; + void *data_ptr; + spc_t *spc_data; + + if(NULL == (fptr = fopen(filename, "r"))) { + printf("Couldn't open xml file.\n"); + MPI_Finalize(); + return -1; + } else { + printf("[%d] Successfully opened the XML file!\n", rank); + } + + /* The following is to read the formatted XML file to get the basic + * information we need to read the shared memory file and properly + * format some counters. 
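+ * The tag order parsed below mirrors the XML example above: shared-memory file
+ * name, file size, number of counters, and clock frequency, followed by one
+ * block of offsets per counter.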
+ */ + char tmp_filename[128]; + fgets(line, 128, fptr); + fgets(line, 128, fptr); + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%s", shm_filename); + + if(rank == 0) { + printf("shm_filename: %s\n", shm_filename); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &shm_file_size); + if(rank == 0) { + printf("shm_file_size: %d\n", shm_file_size); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &num_counters); + if(rank == 0) { + printf("num_counters: %d\n", num_counters); + } + + fgets(line, 128, fptr); + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &freq_mhz); + if(rank == 0) { + printf("freq_mhz: %d\n", freq_mhz); + } + + if(-1 == (shm_fd = open(shm_filename, O_RDONLY))){ + printf("\nCould not open file '%s'... Error String: %s\n", shm_filename, strerror(errno)); + return -1; + } else { + if(MAP_FAILED == (data_ptr = mmap(0, 8192, PROT_READ, MAP_SHARED, shm_fd, 0))) { + printf("Map failed :(\n"); + return -1; + } + printf("Successfully mmap'd file!\n"); + } + + spc_data = (spc_t*)malloc(num_counters * sizeof(spc_t)); + + for(i = 0; i < num_counters; i++) { + fgets(line, 128, fptr); /* Counter begin header */ + /* This should never happen... */ + if(strcmp(line,"\n") == 0) { + printf("Parsing ended prematurely. There weren't enough counters.\n"); + break; + } + + fgets(line, 128, fptr); /* Counter name header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%s", spc_data[i].name); /* Counter name */ + + fgets(line, 128, fptr); /* Counter value offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].offset); /* Counter offset */ + + fgets(line, 128, fptr); /* Counter rules offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].rules_offset); /* Counter rules offset */ + + fgets(line, 128, fptr); /* Counter bins offset header */ + token = strtok(line, ">"); + token = strtok(NULL, "<"); + sscanf(token, "%d", &spc_data[i].bins_offset); /* Counter bins offset */ + + fgets(line, 128, fptr); /* Counter end header */ + } + + fclose(fptr); + + /* The following communication pattern is intended to cause a certain + * number of unexpected messages. + */ + if(rank==0) { + for(i=num_messages; i > 0; i--) { + MPI_Isend(buf, message_size, MPI_BYTE, 1, i, MPI_COMM_WORLD, &requests[i-1]); + } + MPI_Send(buf, message_size, MPI_BYTE, 1, 0, MPI_COMM_WORLD); + MPI_Waitall(num_messages, requests, statuses); + + MPI_Barrier(MPI_COMM_WORLD); + + for(i = 0; i < num_counters; i++) { + if((0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_TIME")) || (0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_QUEUE_TIME"))) { + value = (*((long long*)(data_ptr+spc_data[i].offset))) / freq_mhz; + } else { + value = *((long long*)(data_ptr+spc_data[i].offset)); + } + if(value > 0) + printf("[%d] %s\t%lld\n", rank, spc_data[i].name, value ); + } + MPI_Barrier(MPI_COMM_WORLD); + } else { + MPI_Recv(buf, message_size, MPI_BYTE, 0, 0, MPI_COMM_WORLD, &status); + for(i=0; i < num_messages; i++) { + MPI_Recv(buf, message_size, MPI_BYTE, 0, i+1, MPI_COMM_WORLD, &statuses[i]); + } + + MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + for(i = 0; i < num_counters; i++) { + /* These counters are stored in cycles, so we convert them to microseconds. 
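+ * freq_mhz is the clock frequency in MHz, i.e. cycles per microsecond, so
+ * dividing a cycle count by freq_mhz yields microseconds.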
+ */ + if((0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_TIME")) || (0 == strcmp(spc_data[i].name, "OMPI_SPC_MATCH_QUEUE_TIME"))) { + value = (*((long long*)(data_ptr+spc_data[i].offset))) / freq_mhz; + } else { + value = *((long long*)(data_ptr+spc_data[i].offset)); + } + if(value > 0) { + printf("[%d] %s\t%lld\n", rank, spc_data[i].name, value ); + if(spc_data[i].rules_offset > 0) { + int j, *rules = (int*)(data_ptr+spc_data[i].rules_offset); + long long *bins = (long long*)(data_ptr+spc_data[i].bins_offset); + + for(j = 0; j < rules[0]; j++) { + if(j == rules[0]-1) { + printf("\t> %d\t", rules[j]); + } + else { + printf("\t<= %d\t", rules[j+1]); + } + printf("%lld\n", bins[j]); + } + } + } + } + } + + MPI_Finalize(); + + return 0; +} +``` + +#### Snapshot Feature in the mmap Interface +NOTE: This feature is currently in the process of being moved to an external tool, and will be unavailable until then. +The mmap interface also allows for collecting snapshots of the SPC counter values periodically throughout an execution through a snapshot feature. These snapshots use an MCA parameter to determine the length of time after which to create a copy of the SPC data file from the mmap interface. The snapshot data file copies simply append a timestamp to the end of the mmap data file to keep track of when that snapshot was taken. These snapshot files can be used to show how counter values change over time. + +The following is an example python script that takes the values from these snapshot files and creates heatmaps of the change in the counter values over time. This example script takes three command line arguments: a directory where all of the snapshot, XML, and original data files are stored; the XML string or Open MPI jobid to identify these data and XML files; a comma-separated list of SPCs to be used in creating the heatmaps. + +```python +import sys +import glob +import operator +import struct + +import numpy as np +import matplotlib +matplotlib.use('Agg') # For use with headless systems +import matplotlib.pyplot as plt +import matplotlib.cm as cm +import matplotlib.ticker as ticker + +def combine(filename, data): + f = open(filename, 'rb') + for i in range(0,num_counters): + temp = struct.unpack('l', f.read(8))[0] + if 'TIME' in names[i]: + temp /= freq_mhz + data[i].append(temp) + +def fmt(x, pos): + return '{:,.0f}'.format(x) + +# Make sure the proper number of arguments have been supplied +if len(sys.argv) < 4: + print("Usage: ./spc_snapshot_parse.py [/path/to/data/files] [datafile_label] [list_of_spcs]") + exit() + +path = sys.argv[1] +label = sys.argv[2] + +xml_filename = '' +# Lists for storing the snapshot data files from each rank +copies = [] +ends = [] +# Populate the lists with the appropriate data files +for filename in glob.glob(path + "/spc_data*"): + if label in filename: + if xml_filename == '' and '.xml' in filename: + xml_filename = filename + if '.xml' not in filename: + temp = filename.split('/')[-1].split('.') + if len(temp) < 5: + temp[-1] = int(temp[-1]) + ends.append(temp) + else: + temp[-1] = int(temp[-1]) + temp[-2] = int(temp[-2]) + copies.append(temp) + +# Sort the lists +ends = sorted(ends, key = operator.itemgetter(-1)) +for i in range(0,len(ends)): + ends[i][-1] = str(ends[i][-1]) +copies = sorted(copies, key = operator.itemgetter(-2,-1)) +for i in range(0,len(copies)): + copies[i][-1] = str(copies[i][-1]) + copies[i][-2] = str(copies[i][-2]) + +sep = '.' 
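+# The snapshot filename components were split on '.' above; sep is used with
+# sep.join(...) below to rebuild the full dotted filenames.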
+ +xml_file = open(xml_filename, 'r') +num_counters = 0 +freq_mhz = 0 +names = [] +base = [] +# Parse the XML file (same for all data files) +for line in xml_file: + if 'num_counters' in line: + num_counters = int(line.split('>')[1].split('<')[0]) + if 'freq_mhz' in line: + freq_mhz = int(line.split('>')[1].split('<')[0]) + if '' in line: + names.append(line.split('>')[1].split('<')[0]) + value = [names[-1]] + base.append(value) + +prev = copies[0] +i = 0 +ranks = [] +values = [] +times = [] +time = [] + +# Populate the data lists +for n in range(0,len(base)): + values.append([0, names[n]]) +for c in copies: + if c[-2] != prev[-2]: + filename = path + "/" + sep.join(ends[i]) + combine(filename, values) + + ranks.append(values) + times.append(time) + for j in range(0, len(names)): + temp = [ranks[0][j][0]] + + values = [] + time = [] + for n in range(0,len(base)): + values.append([i+1, names[n]]) + i += 1 + + filename = path + "/" + sep.join(c) + time.append(int(filename.split('.')[-1])) + combine(filename, values) + prev = c + +filename = path + "/" + sep.join(ends[i]) +combine(filename, values) +ranks.append(values) +times.append(time) + +spc_list = sys.argv[3].split(",") + +for i in range(0, len(names)): + fig = plt.figure(num=None, figsize=(7, 9), dpi=200, facecolor='w', edgecolor='k') + + plot = False + # Only plot the SPCs of interest + if names[i] in spc_list: + plot = True + + map_data = [] + avg_x = [] + + for j in range(0, len(ranks)): + if avg_x == None: + avg_x = np.zeros(len(times[j])-1) + empty = True + for k in range(2,len(ranks[j][i])): + if ranks[j][i][k] != 0: + empty = False + break + if not empty: + if plot: + xvals = [] + yvals = [] + for l in range(1, len(times[j])): + if ranks[j][i][l+2] - ranks[j][i][l+1] < 0: + break + xvals.append(times[j][l] - times[j][0]) + yvals.append(ranks[j][i][l+2] - ranks[j][i][l+1]) + + map_data.append(yvals) + for v in range(0,len(avg_x)): + avg_x[v] += xvals[v] + if plot: + for v in range(0,len(avg_x)): + avg_x[v] /= float(len(ranks)) + + ax = plt.gca() + im = ax.imshow(map_data, cmap='Reds', interpolation='nearest') + + cbar = ax.figure.colorbar(im, ax=ax, format=ticker.FuncFormatter(fmt)) + cbar.ax.set_ylabel("Counter Value", rotation=-90, va="bottom") + + plt.title(names[i] + ' Snapshot Difference') + + plt.xlabel('Time') + plt.ylabel('MPI Rank') + + ax.set_xticks(np.arange(len(avg_x))) + ax.set_yticks(np.arange(len(map_data))) + ax.set_xticklabels(avg_x) + + plt.show() + fig.savefig(names[i] + '.png') +``` + +### List of Counters + +|Name |Description| +| --- | --------- | +|OMPI_SPC_SEND|The number of times MPI_Send was called.| +|OMPI_SPC_BSEND|The number of times MPI_Bsend was called.| +|OMPI_SPC_RSEND|The number of times MPI_Rsend was called.| +|OMPI_SPC_SSEND|The number of times MPI_Ssend was called.| +|OMPI_SPC_RECV|The number of times MPI_Recv was called.| +|OMPI_SPC_MRECV|The number of times MPI_Mrecv was called.| +|OMPI_SPC_ISEND|The number of times MPI_Isend was called.| +|OMPI_SPC_IBSEND|The number of times MPI_Ibsend was called.| +|OMPI_SPC_IRSEND|The number of times MPI_Irsend was called.| +|OMPI_SPC_ISSEND|The number of times MPI_Issend was called.| +|OMPI_SPC_IRECV|The number of times MPI_Irecv was called.| +|OMPI_SPC_SENDRECV|The number of times MPI_Sendrecv was called.| +|OMPI_SPC_SENDRECV_REPLACE|The number of times MPI_Sendrecv_replace was called.| +|OMPI_SPC_PUT|The number of times MPI_Put was called.| +|OMPI_SPC_RPUT|The number of times MPI_Rput was called.| +|OMPI_SPC_GET|The number of times MPI_Get was called.| 
+|OMPI_SPC_RGET|The number of times MPI_Rget was called.| +|OMPI_SPC_PROBE|The number of times MPI_Probe was called.| +|OMPI_SPC_IPROBE|The number of times MPI_Iprobe was called.| +|OMPI_SPC_BCAST|The number of times MPI_Bcast was called.| +|OMPI_SPC_IBCAST|The number of times MPI_Ibcast was called.| +|OMPI_SPC_BCAST_INIT|The number of times MPIX_Bcast_init was called.| +|OMPI_SPC_REDUCE|The number of times MPI_Reduce was called.| +|OMPI_SPC_REDUCE_SCATTER|The number of times MPI_Reduce_scatter was called.| +|OMPI_SPC_REDUCE_SCATTER_BLOCK|The number of times MPI_Reduce_scatter_block was called.| +|OMPI_SPC_IREDUCE|The number of times MPI_Ireduce was called.| +|OMPI_SPC_IREDUCE_SCATTER|The number of times MPI_Ireduce_scatter was called.| +|OMPI_SPC_IREDUCE_SCATTER_BLOCK|The number of times MPI_Ireduce_scatter_block was called.| +|OMPI_SPC_REDUCE_INIT|The number of times MPIX_Reduce_init was called.| +|OMPI_SPC_REDUCE_SCATTER_INIT|The number of times MPIX_Reduce_scatter_init was called.| +|OMPI_SPC_REDUCE_SCATTER_BLOCK_INIT|The number of times MPIX_Reduce_scatter_block_init was called.| +|OMPI_SPC_ALLREDUCE|The number of times MPI_Allreduce was called.| +|OMPI_SPC_IALLREDUCE|The number of times MPI_Iallreduce was called.| +|OMPI_SPC_ALLREDUCE_INIT|The number of times MPIX_Allreduce_init was called.| +|OMPI_SPC_SCAN|The number of times MPI_Scan was called.| +|OMPI_SPC_EXSCAN|The number of times MPI_Exscan was called.| +|OMPI_SPC_ISCAN|The number of times MPI_Iscan was called.| +|OMPI_SPC_IEXSCAN|The number of times MPI_Iexscan was called.| +|OMPI_SPC_SCAN_INIT|The number of times MPIX_Scan_init was called.| +|OMPI_SPC_EXSCAN_INIT|The number of times MPIX_Exscan_init was called.| +|OMPI_SPC_SCATTER|The number of times MPI_Scatter was called.| +|OMPI_SPC_SCATTERV|The number of times MPI_Scatterv was called.| +|OMPI_SPC_ISCATTER|The number of times MPI_Iscatter was called.| +|OMPI_SPC_ISCATTERV|The number of times MPI_Iscatterv was called.| +|OMPI_SPC_SCATTER_INIT|The number of times MPIX_Scatter_init was called.| +|OMPI_SPC_SCATTERV_INIT|The number of times MPIX_Scatterv_init was called.| +|OMPI_SPC_GATHER|The number of times MPI_Gather was called.| +|OMPI_SPC_GATHERV|The number of times MPI_Gatherv was called.| +|OMPI_SPC_IGATHER|The number of times MPI_Igather was called.| +|OMPI_SPC_IGATHERV|The number of times MPI_Igatherv was called.| +|OMPI_SPC_GATHER_INIT|The number of times MPIX_Gather_init was called.| +|OMPI_SPC_GATHERV_INIT|The number of times MPIX_Gatherv_init was called.| +|OMPI_SPC_ALLTOALL|The number of times MPI_Alltoall was called.| +|OMPI_SPC_ALLTOALLV|The number of times MPI_Alltoallv was called.| +|OMPI_SPC_ALLTOALLW|The number of times MPI_Alltoallw was called.| +|OMPI_SPC_IALLTOALL|The number of times MPI_Ialltoall was called.| +|OMPI_SPC_IALLTOALLV|The number of times MPI_Ialltoallv was called.| +|OMPI_SPC_IALLTOALLW|The number of times MPI_Ialltoallw was called.| +|OMPI_SPC_ALLTOALL_INIT|The number of times MPIX_Alltoall_init was called.| +|OMPI_SPC_ALLTOALLV_INIT|The number of times MPIX_Alltoallv_init was called.| +|OMPI_SPC_ALLTOALLW_INIT|The number of times MPIX_Alltoallw_init was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALL|The number of times MPI_Neighbor_alltoall was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALLV|The number of times MPI_Neighbor_alltoallv was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALLW|The number of times MPI_Neighbor_alltoallw was called.| +|OMPI_SPC_INEIGHBOR_ALLTOALL|The number of times MPI_Ineighbor_alltoall was called.| +|OMPI_SPC_INEIGHBOR_ALLTOALLV|The number 
of times MPI_Ineighbor_alltoallv was called.| +|OMPI_SPC_INEIGHBOR_ALLTOALLW|The number of times MPI_Ineighbor_alltoallw was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALL_INIT|The number of times MPIX_Neighbor_alltoall_init was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALLV_INIT|The number of times MPIX_Neighbor_alltoallv_init was called.| +|OMPI_SPC_NEIGHBOR_ALLTOALLW_INIT|The number of times MPIX_Neighbor_alltoallw_init was called.| +|OMPI_SPC_ALLGATHER|The number of times MPI_Allgather was called.| +|OMPI_SPC_ALLGATHERV|The number of times MPI_Allgatherv was called.| +|OMPI_SPC_IALLGATHER|The number of times MPI_Iallgather was called.| +|OMPI_SPC_IALLGATHERV|The number of times MPI_Iallgatherv was called.| +|OMPI_SPC_ALLGATHER_INIT|The number of times MPIX_Allgather_init was called.| +|OMPI_SPC_ALLGATHERV_INIT|The number of times MPIX_Allgatherv_init was called.| +|OMPI_SPC_NEIGHBOR_ALLGATHER|The number of times MPI_Neighbor_allgather was called.| +|OMPI_SPC_NEIGHBOR_ALLGATHERV|The number of times MPI_Neighbor_allgatherv was called.| +|OMPI_SPC_INEIGHBOR_ALLGATHER|The number of times MPI_Ineighbor_allgather was called.| +|OMPI_SPC_INEIGHBOR_ALLGATHERV|The number of times MPI_Ineighbor_allgatherv was called.| +|OMPI_SPC_NEIGHBOR_ALLGATHER_INIT|The number of times MPIX_Neighbor_allgather_init was called.| +|OMPI_SPC_NEIGHBOR_ALLGATHERV_INIT|The number of times MPIX_Neighbor_allgatherv_init was called.| +|OMPI_SPC_TEST|The number of times MPI_Test was called.| +|OMPI_SPC_TESTALL|The number of times MPI_Testall was called.| +|OMPI_SPC_TESTANY|The number of times MPI_Testany was called.| +|OMPI_SPC_TESTSOME|The number of times MPI_Testsome was called.| +|OMPI_SPC_WAIT|The number of times MPI_Wait was called.| +|OMPI_SPC_WAITALL|The number of times MPI_Waitall was called.| +|OMPI_SPC_WAITANY|The number of times MPI_Waitany was called.| +|OMPI_SPC_WAITSOME|The number of times MPI_Waitsome was called.| +|OMPI_SPC_BARRIER|The number of times MPI_Barrier was called.| +|OMPI_SPC_IBARRIER|The number of times MPI_Ibarrier was called.| +|OMPI_SPC_BARRIER_INIT|The number of times MPIX_Barrier_init was called.| +|OMPI_SPC_WTIME|The number of times MPI_Wtime was called.| +|OMPI_SPC_CANCEL|The number of times MPI_Cancel was called.| +|OMPI_SPC_BYTES_RECEIVED_USER|The number of bytes received by the user through point-to-point communications. Note: Includes bytes transferred using internal RMA operations.| +|OMPI_SPC_BYTES_RECEIVED_MPI|The number of bytes received by MPI through collective| +|OMPI_SPC_BYTES_SENT_USER|The number of bytes sent by the user through point-to-point communications. 
Note: Includes bytes transferred using internal RMA operations.| +|OMPI_SPC_BYTES_SENT_MPI|The number of bytes sent by MPI through collective| +|OMPI_SPC_BYTES_PUT|The number of bytes sent/received using RMA Put operations both through user-level Put functions and internal Put functions.| +|OMPI_SPC_BYTES_GET|The number of bytes sent/received using RMA Get operations both through user-level Get functions and internal Get functions.| +|OMPI_SPC_UNEXPECTED|The number of messages that arrived as unexpected messages.| +|OMPI_SPC_OUT_OF_SEQUENCE|The number of messages that arrived out of the proper sequence.| +|OMPI_SPC_OOS_QUEUE_HOPS|The number of times we jumped to the next element in the out of sequence message queue's ordered list.| +|OMPI_SPC_MATCH_TIME|The amount of time | +|OMPI_SPC_MATCH_QUEUE_TIME|The amount of time | +|OMPI_SPC_OOS_MATCH_TIME|The amount of time | +|OMPI_SPC_OOS_MATCH_QUEUE_TIME|The amount of time | +|OMPI_SPC_UNEXPECTED_IN_QUEUE|The number of messages that are currently in the unexpected message queue| +|OMPI_SPC_OOS_IN_QUEUE|The number of messages that are currently in the out of sequence message queue| +|OMPI_SPC_MAX_UNEXPECTED_IN_QUEUE|The maximum number of messages that the unexpected message queue| +|OMPI_SPC_MAX_OOS_IN_QUEUE|The maximum number of messages that the out of sequence message queue| +|OMPI_SPC_BASE_BCAST_LINEAR|The number of times the base broadcast used the linear algorithm.| +|OMPI_SPC_BASE_BCAST_CHAIN|The number of times the base broadcast used the chain algorithm.| +|OMPI_SPC_BASE_BCAST_PIPELINE|The number of times the base broadcast used the pipeline algorithm.| +|OMPI_SPC_BASE_BCAST_SPLIT_BINTREE|The number of times the base broadcast used the split binary tree algorithm.| +|OMPI_SPC_BASE_BCAST_BINTREE|The number of times the base broadcast used the binary tree algorithm.| +|OMPI_SPC_BASE_BCAST_BINOMIAL|The number of times the base broadcast used the binomial algorithm.| +|OMPI_SPC_BASE_REDUCE_CHAIN|The number of times the base reduce used the chain algorithm.| +|OMPI_SPC_BASE_REDUCE_PIPELINE|The number of times the base reduce used the pipeline algorithm.| +|OMPI_SPC_BASE_REDUCE_BINARY|The number of times the base reduce used the binary tree algorithm.| +|OMPI_SPC_BASE_REDUCE_BINOMIAL|The number of times the base reduce used the binomial tree algorithm.| +|OMPI_SPC_BASE_REDUCE_IN_ORDER_BINTREE|The number of times the base reduce used the in order binary tree algorithm.| +|OMPI_SPC_BASE_REDUCE_LINEAR|The number of times the base reduce used the basic linear algorithm.| +|OMPI_SPC_BASE_REDUCE_SCATTER_NONOVERLAPPING|The number of times the base reduce scatter used the nonoverlapping algorithm.| +|OMPI_SPC_BASE_REDUCE_SCATTER_RECURSIVE_HALVING|The number of times the base reduce scatter used the recursive halving algorithm.| +|OMPI_SPC_BASE_REDUCE_SCATTER_RING|The number of times the base reduce scatter used the ring algorithm.| +|OMPI_SPC_BASE_ALLREDUCE_NONOVERLAPPING|The number of times the base allreduce used the nonoverlapping algorithm.| +|OMPI_SPC_BASE_ALLREDUCE_RECURSIVE_DOUBLING|The number of times the base allreduce used the recursive doubling algorithm.| +|OMPI_SPC_BASE_ALLREDUCE_RING|The number of times the base allreduce used the ring algorithm.| +|OMPI_SPC_BASE_ALLREDUCE_RING_SEGMENTED|The number of times the base allreduce used the segmented ring algorithm.| +|OMPI_SPC_BASE_ALLREDUCE_LINEAR|The number of times the base allreduce used the linear algorithm.| +|OMPI_SPC_BASE_SCATTER_BINOMIAL|The number of times the base scatter used the 
binomial tree algorithm.| +|OMPI_SPC_BASE_SCATTER_LINEAR|The number of times the base scatter used the linear algorithm.| +|OMPI_SPC_BASE_GATHER_BINOMIAL|The number of times the base gather used the binomial tree algorithm.| +|OMPI_SPC_BASE_GATHER_LINEAR_SYNC|The number of times the base gather used the synchronous linear algorithm.| +|OMPI_SPC_BASE_GATHER_LINEAR|The number of times the base gather used the linear algorithm.| +|OMPI_SPC_BASE_ALLTOALL_INPLACE|The number of times the base alltoall used the in-place algorithm.| +|OMPI_SPC_BASE_ALLTOALL_PAIRWISE|The number of times the base alltoall used the pairwise algorithm.| +|OMPI_SPC_BASE_ALLTOALL_BRUCK|The number of times the base alltoall used the bruck algorithm.| +|OMPI_SPC_BASE_ALLTOALL_LINEAR_SYNC|The number of times the base alltoall used the synchronous linear algorithm.| +|OMPI_SPC_BASE_ALLTOALL_TWO_PROCS|The number of times the base alltoall used the two process algorithm.| +|OMPI_SPC_BASE_ALLTOALL_LINEAR|The number of times the base alltoall used the linear algorithm.| +|OMPI_SPC_BASE_ALLGATHER_BRUCK|The number of times the base allgather used the bruck algorithm.| +|OMPI_SPC_BASE_ALLGATHER_RECURSIVE_DOUBLING|The number of times the base allgather used the recursive doubling algorithm.| +|OMPI_SPC_BASE_ALLGATHER_RING|The number of times the base allgather used the ring algorithm.| +|OMPI_SPC_BASE_ALLGATHER_NEIGHBOR_EXCHANGE|The number of times the base allgather used the neighbor exchange algorithm.| +|OMPI_SPC_BASE_ALLGATHER_TWO_PROCS|The number of times the base allgather used the two process algorithm.| +|OMPI_SPC_BASE_ALLGATHER_LINEAR|The number of times the base allgather used the linear algorithm.| +|OMPI_SPC_BASE_BARRIER_DOUBLE_RING|The number of times the base barrier used the double ring algorithm.| +|OMPI_SPC_BASE_BARRIER_RECURSIVE_DOUBLING|The number of times the base barrier used the recursive doubling algorithm.| +|OMPI_SPC_BASE_BARRIER_BRUCK|The number of times the base barrier used the bruck algorithm.| +|OMPI_SPC_BASE_BARRIER_TWO_PROCS|The number of times the base barrier used the two process algorithm.| +|OMPI_SPC_BASE_BARRIER_LINEAR|The number of times the base barrier used the linear algorithm.| +|OMPI_SPC_BASE_BARRIER_TREE|The number of times the base barrier used the tree algorithm.| +|OMPI_SPC_P2P_MESSAGE_SIZE|This is a bin counter with two subcounters. The first is messages that are less than or equal to mpi_spc_p2p_message_boundary bytes and the second is those that are larger than mpi_spc_p2p_message_boundary bytes.| +|OMPI_SPC_EAGER_MESSAGES|The number of messages that fall within the eager size.| +|OMPI_SPC_NOT_EAGER_MESSAGES|The number of messages that do not fall within the eager size.| +|OMPI_SPC_QUEUE_ALLOCATION|The amount of memory allocated after runtime currently in use for temporary message queues like the unexpected message queue and the out of sequence message queue.| +|OMPI_SPC_MAX_QUEUE_ALLOCATION|The maximum amount of memory allocated after runtime at one point for temporary message queues like the unexpected message queue and the out of sequence message queue. Note: The OMPI_SPC_QUEUE_ALLOCATION counter must also be activated.| +|OMPI_SPC_UNEXPECTED_QUEUE_DATA|The amount of memory currently in use for the unexpected message queue.| +|OMPI_SPC_MAX_UNEXPECTED_QUEUE_DATA|The maximum amount of memory in use for the unexpected message queue. 
Note: The OMPI_SPC_UNEXPECTED_QUEUE_DATA counter must also be activated.| +|OMPI_SPC_OOS_QUEUE_DATA|The amount of memory currently in use for the out-of-sequence message queue.| +|OMPI_SPC_MAX_OOS_QUEUE_DATA|The maximum amount of memory in use for the out-of-sequence message queue. Note: The OMPI_SPC_OOS_QUEUE_DATA counter must also be activated.| diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index 194ac060da1..4700813ba49 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2018 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -26,6 +26,9 @@ #define OMPI_RUNTIME_PARAMS_H #include "ompi_config.h" +#if SPC_ENABLE == 1 +#include "ompi_spc.h" +#endif BEGIN_C_DECLS @@ -141,20 +144,6 @@ OMPI_DECLSPEC extern bool ompi_async_mpi_init; /* EXPERIMENTAL: do not perform an RTE barrier at the beginning of MPI_Finalize */ OMPI_DECLSPEC extern bool ompi_async_mpi_finalize; -/** - * A comma delimited list of SPC counters to turn on or 'attach'. To turn - * all counters on, the string can be simply "all". An empty string will - * keep all counters turned off. - */ -OMPI_DECLSPEC extern char * ompi_mpi_spc_attach_string; - -/** - * A boolean value that determines whether or not to dump the SPC counter - * values in MPI_Finalize. A value of true dumps the counters and false does not. - */ -OMPI_DECLSPEC extern bool ompi_mpi_spc_dump_enabled; - - /** * Register MCA parameters used by the MPI layer. *