diff --git a/src/common/cpu.c b/src/common/cpu.c
index 946c9905..e3f6bcb5 100644
--- a/src/common/cpu.c
+++ b/src/common/cpu.c
@@ -34,6 +34,12 @@ int64_t get_freq(struct frequency* freq) {
   return freq->max;
 }
 
+#ifdef ARCH_X86
+int64_t get_freq_pp(struct frequency* freq) {
+  return freq->max_pp;
+}
+#endif
+
 #if defined(ARCH_X86) || defined(ARCH_PPC)
 char* get_str_cpu_name(struct cpuInfo* cpu, bool fcpuname) {
   #ifdef ARCH_X86
diff --git a/src/common/cpu.h b/src/common/cpu.h
index aabcee7e..3e63e8bc 100644
--- a/src/common/cpu.h
+++ b/src/common/cpu.h
@@ -60,6 +60,11 @@ struct frequency {
   int32_t max;
   // Indicates if max frequency was measured
   bool measured;
+#ifdef ARCH_X86
+  // Max frequency when running vectorized code.
+  // Used only for peak performance computation.
+  int32_t max_pp;
+#endif
 };
 
 struct hypervisor {
@@ -188,6 +193,8 @@ struct cpuInfo {
 #ifdef ARCH_X86
   // The index of the first core in the module
   uint32_t first_core_id;
+  // The index of this module
+  uint32_t module_id;
 #endif
 #endif
 };
@@ -200,6 +207,9 @@ uint32_t get_nsockets(struct topology* topo);
 
 VENDOR get_cpu_vendor(struct cpuInfo* cpu);
 int64_t get_freq(struct frequency* freq);
+#ifdef ARCH_X86
+int64_t get_freq_pp(struct frequency* freq);
+#endif
 
 char* get_str_aes(struct cpuInfo* cpu);
 char* get_str_sha(struct cpuInfo* cpu);
diff --git a/src/x86/cpuid.c b/src/x86/cpuid.c
index dd67ee09..dfa9aa19 100644
--- a/src/x86/cpuid.c
+++ b/src/x86/cpuid.c
@@ -210,18 +210,14 @@ int64_t get_peak_performance(struct cpuInfo* cpu, bool accurate_pp) {
 
   for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) {
     struct topology* topo = ptr->topo;
-    int64_t max_freq = get_freq(ptr->freq);
+    int64_t freq = get_freq(ptr->freq);
 
-    int64_t freq;
   #ifdef __linux__
     if(accurate_pp)
-      freq = measure_frequency(ptr);
-    else
-      freq = max_freq;
+      freq = get_freq_pp(ptr->freq);
   #else
     // Silence compiler warning
     (void)(accurate_pp);
-    freq = max_freq;
   #endif
 
     //First, check we have consistent data
@@ -450,6 +446,27 @@ int32_t get_core_type(void) {
   }
 }
 
+#ifdef __linux__
+// Gets the max frequency for estimating the peak performance,
+// filling in the passed cpuInfo parameter with this information.
+void fill_frequency_info_pp(struct cpuInfo* cpu) {
+  int32_t unused;
+  int32_t *max_freq_pp_vec = emalloc(sizeof(int32_t) * cpu->num_cpus);
+  struct cpuInfo* ptr = cpu;
+
+  for (uint32_t i=0; i < cpu->num_cpus; i++) {
+    set_cpu_module(i, cpu->num_cpus, &unused);
+
+    ptr->freq->max_pp = measure_frequency(ptr, max_freq_pp_vec);
+    ptr = ptr->next_cpu;
+  }
+
+  // The per-module results have been copied into each freq->max_pp,
+  // so the scratch vector is no longer needed.
+  free(max_freq_pp_vec);
+}
+#endif
+
 struct cpuInfo* get_cpu_info(void) {
   struct cpuInfo* cpu = emalloc(sizeof(struct cpuInfo));
   cpu->peak_performance = -1;
@@ -546,6 +563,7 @@ struct cpuInfo* get_cpu_info(void) {
       ptr->core_type = get_core_type();
     }
     ptr->first_core_id = first_core;
+    ptr->module_id = i;
     ptr->feat = get_features_info(ptr);
 
     ptr->arch = get_cpu_uarch(ptr);
@@ -570,6 +588,13 @@ struct cpuInfo* get_cpu_info(void) {
     if(ptr->topo == NULL) return cpu;
   }
 
+#ifdef __linux__
+  // If accurate_pp is requested, we need to get the max frequency
+  // after fetching the topology for all CPU modules, since the topology
+  // is required by fill_frequency_info_pp
+  if (accurate_pp()) fill_frequency_info_pp(cpu);
+#endif
+
   cpu->peak_performance = get_peak_performance(cpu, accurate_pp());
 
   return cpu;
@@ -1005,6 +1030,7 @@ struct frequency* get_frequency_info(struct cpuInfo* cpu) {
   }
   #endif
 
+  freq->max_pp = UNKNOWN_DATA;
   return freq;
 }
 
diff --git a/src/x86/freq/freq.c b/src/x86/freq/freq.c
index 615b66ab..308b8aff 100644
--- a/src/x86/freq/freq.c
+++ b/src/x86/freq/freq.c
@@ -21,9 +21,12 @@
 #define FREQ_VECTOR_SIZE 1<<16
 
 struct freq_thread {
+  // Inputs
+  struct cpuInfo* cpu;
   bool end;
   bool measure;
-  double freq;
+  // Output
+  int32_t *max_pp;
 };
 
 double vector_average_harmonic(double* v, int len) {
@@ -48,6 +51,7 @@ void* measure_freq(void *freq_ptr) {
   char* line = NULL;
   size_t len = 0;
   ssize_t read;
+  struct cpuInfo* cpu = freq->cpu;
 
   int v = 0;
   double* freq_vector = malloc(sizeof(double) * FREQ_VECTOR_SIZE);
@@ -76,18 +80,43 @@ void* measure_freq(void *freq_ptr) {
     sleep_ms(500);
   }
 
-  freq->freq = vector_average_harmonic(freq_vector, v);
-  printWarn("AVX2 measured freq=%f\n", freq->freq);
+  if (cpu->hybrid_flag) {
+    // We have an heterogeneous architecture. After measuring the
+    // frequency for all cores, we now need to compute the average
+    // independently for each CPU module.
+    struct cpuInfo* ptr = cpu;
+    double* freq_vector_ptr = freq_vector;
+
+    for (int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) {
+      freq->max_pp[i] = vector_average_harmonic(freq_vector_ptr, ptr->topo->total_cores_module);
+      printWarn("AVX2 measured freq=%d (module %d)", freq->max_pp[i], i);
+
+      freq_vector_ptr = freq_vector_ptr + ptr->topo->total_cores_module;
+    }
+  }
+  else {
+    freq->max_pp[0] = vector_average_harmonic(freq_vector, v);
+    printWarn("AVX2 measured freq=%d\n", freq->max_pp[0]);
+  }
 
   return NULL;
 }
 
-int64_t measure_frequency(struct cpuInfo* cpu) {
+int32_t measure_frequency(struct cpuInfo* cpu, int32_t *max_freq_pp_vec) {
+  if (cpu->hybrid_flag && cpu->module_id > 0) {
+    // We have a hybrid architecture and we have already
+    // measured the frequency for this module in a previous
+    // call to this function, so now just return it.
+    return max_freq_pp_vec[cpu->module_id];
+  }
+
   int ret;
   int num_spaces;
   struct freq_thread* freq_struct = malloc(sizeof(struct freq_thread));
   freq_struct->end = false;
   freq_struct->measure = false;
+  freq_struct->cpu = cpu;
+  freq_struct->max_pp = max_freq_pp_vec;
 
   void* (*compute_function)(void*);
 
@@ -159,5 +188,5 @@ int64_t measure_frequency(struct cpuInfo* cpu) {
   }
 
   printf("\r%*c", num_spaces, ' ');
-  return freq_struct->freq;
+  return max_freq_pp_vec[0];
 }
diff --git a/src/x86/freq/freq.h b/src/x86/freq/freq.h
index de62916c..cc6f98cd 100644
--- a/src/x86/freq/freq.h
+++ b/src/x86/freq/freq.h
@@ -8,6 +8,6 @@
 #define MEASURE_TIME_SECONDS 5
 #define LOOP_ITERS 100000000
 
-int64_t measure_frequency(struct cpuInfo* cpu);
+int32_t measure_frequency(struct cpuInfo* cpu, int32_t *max_freq_pp_vec);
 
 #endif