Skip to content

Commit

Permalink
[v1.06][X86] Fix accurate-pp in hybrid architectures (fixes #169)
Browse files Browse the repository at this point in the history
Overview of changes:
- Adds field max_pp in frequency struct to hold the max freq for peak-performance estimation.
- Instead of getting the max frequency in get_peak_performance, we get it in get_cpu_info (more natural).
- Adds fill_frequency_info_pp which fills the max_pp of the passed cpu by calling measure_frequency.

The approach is to call measure_frequency with a vector where the max frequencies are stored. Then,
the first time measure_frequency is called, the frequency is measured while running all the cores,
and the max frequency is computed per module (e.g., in the case of 2 modules, we would compute
the freq for the first and for the second module), and saved into this vector. Subsequent calls to
measure_frequency will just read the corresponding value from the vector. In other words, the frequency
is only measured once for the whole CPU.
  • Loading branch information
Dr-Noob committed Sep 10, 2024
1 parent edbfc97 commit ab43a11
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 12 deletions.
6 changes: 6 additions & 0 deletions src/common/cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ int64_t get_freq(struct frequency* freq) {
return freq->max;
}

#ifdef ARCH_X86
// Returns the max frequency kept for peak-performance estimation,
// i.e. the max_pp field of the passed frequency struct, widened to 64 bits.
int64_t get_freq_pp(struct frequency* freq) {
  int64_t max_pp = freq->max_pp;
  return max_pp;
}
#endif

#if defined(ARCH_X86) || defined(ARCH_PPC)
char* get_str_cpu_name(struct cpuInfo* cpu, bool fcpuname) {
#ifdef ARCH_X86
Expand Down
10 changes: 10 additions & 0 deletions src/common/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ struct frequency {
int32_t max;
// Indicates if max frequency was measured
bool measured;
#ifdef ARCH_X86
// Max frequency when running vectorized code.
// Used only for peak performance computation.
int32_t max_pp;
#endif
};

struct hypervisor {
Expand Down Expand Up @@ -188,6 +193,8 @@ struct cpuInfo {
#ifdef ARCH_X86
// The index of the first core in the module
uint32_t first_core_id;
// The index of this module
uint32_t module_id;
#endif
#endif
};
Expand All @@ -200,6 +207,9 @@ uint32_t get_nsockets(struct topology* topo);

VENDOR get_cpu_vendor(struct cpuInfo* cpu);
int64_t get_freq(struct frequency* freq);
#ifdef ARCH_X86
int64_t get_freq_pp(struct frequency* freq);
#endif

char* get_str_aes(struct cpuInfo* cpu);
char* get_str_sha(struct cpuInfo* cpu);
Expand Down
34 changes: 28 additions & 6 deletions src/x86/cpuid.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,18 +210,14 @@ int64_t get_peak_performance(struct cpuInfo* cpu, bool accurate_pp) {

for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) {
struct topology* topo = ptr->topo;
int64_t max_freq = get_freq(ptr->freq);
int64_t freq = get_freq(ptr->freq);

int64_t freq;
#ifdef __linux__
if(accurate_pp)
freq = measure_frequency(ptr);
else
freq = max_freq;
freq = get_freq_pp(ptr->freq);
#else
// Silence compiler warning
(void)(accurate_pp);
freq = max_freq;
#endif

//First, check we have consistent data
Expand Down Expand Up @@ -450,6 +446,23 @@ int32_t get_core_type(void) {
}
}

#ifdef __linux__
// Gets the max frequency for estimating the peak performance,
// filling in the passed cpuInfo parameter with this information.
//
// cpu: first element of the list of CPU modules (linked through next_cpu).
//      The max_pp field of the frequency struct of each of the
//      cpu->num_cpus modules is overwritten with the value returned by
//      measure_frequency.
//
// The scratch vector shared with measure_frequency is owned by this
// function and is released before returning. If it cannot be allocated,
// the function returns early and no max_pp value is modified.
void fill_frequency_info_pp(struct cpuInfo* cpu) {
  // Out-parameter required by set_cpu_module; its value is not used here.
  int32_t unused;
  // One slot per module. Zero-initialized so that a slot measure_frequency
  // never writes reads back as 0 instead of indeterminate heap contents.
  int32_t *max_freq_pp_vec = calloc(cpu->num_cpus, sizeof *max_freq_pp_vec);
  if (max_freq_pp_vec == NULL) return;

  struct cpuInfo* ptr = cpu;

  for (uint32_t i=0; i < cpu->num_cpus; i++) {
    // NOTE(review): presumably binds execution to module i before
    // measuring; set_cpu_module is defined elsewhere - confirm.
    set_cpu_module(i, cpu->num_cpus, &unused);

    ptr->freq->max_pp = measure_frequency(ptr, max_freq_pp_vec);
    ptr = ptr->next_cpu;
  }

  // Every max_pp has been copied out of the vector; release it so it
  // does not leak.
  free(max_freq_pp_vec);
}
#endif

struct cpuInfo* get_cpu_info(void) {
struct cpuInfo* cpu = emalloc(sizeof(struct cpuInfo));
cpu->peak_performance = -1;
Expand Down Expand Up @@ -546,6 +559,7 @@ struct cpuInfo* get_cpu_info(void) {
ptr->core_type = get_core_type();
}
ptr->first_core_id = first_core;
ptr->module_id = i;
ptr->feat = get_features_info(ptr);

ptr->arch = get_cpu_uarch(ptr);
Expand All @@ -570,6 +584,13 @@ struct cpuInfo* get_cpu_info(void) {
if(ptr->topo == NULL) return cpu;
}

#ifdef __linux__
// If accurate_pp is requested, we need to get the max frequency
// after fetching the topology for all CPU modules, since the topology
// is required by fill_frequency_info_pp
if (accurate_pp()) fill_frequency_info_pp(cpu);
#endif

cpu->peak_performance = get_peak_performance(cpu, accurate_pp());

return cpu;
Expand Down Expand Up @@ -1005,6 +1026,7 @@ struct frequency* get_frequency_info(struct cpuInfo* cpu) {
}
#endif

freq->max_pp = UNKNOWN_DATA;
return freq;
}

Expand Down
39 changes: 34 additions & 5 deletions src/x86/freq/freq.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,12 @@
// Capacity (number of samples) of the vector used to store frequency
// readings. Parenthesized so the shift is evaluated as a unit regardless
// of the operators surrounding the macro at its point of use.
#define FREQ_VECTOR_SIZE (1<<16)

// State handed by measure_frequency to the measuring routine
// (measure_freq), which reads cpu and writes its results into max_pp.
struct freq_thread {
// Inputs
// CPU description. measure_freq reads hybrid_flag and num_cpus from it,
// and walks next_cpu / topo->total_cores_module for each module.
struct cpuInfo* cpu;
// Both flags are set to false by measure_frequency before measuring.
// NOTE(review): presumably used to tell the measuring thread when to
// sample and when to stop; their readers are not visible here - confirm.
bool end;
bool measure;
// Harmonic mean of all the collected frequency samples.
// NOTE(review): the per-module results are stored through max_pp;
// confirm whether this field is still needed.
double freq;
// Output
// Caller-provided vector. measure_freq stores the measured frequency of
// module i in max_pp[i] on hybrid CPUs, and only max_pp[0] otherwise.
int32_t *max_pp;
};

double vector_average_harmonic(double* v, int len) {
Expand All @@ -48,6 +51,7 @@ void* measure_freq(void *freq_ptr) {
char* line = NULL;
size_t len = 0;
ssize_t read;
struct cpuInfo* cpu = freq->cpu;

int v = 0;
double* freq_vector = malloc(sizeof(double) * FREQ_VECTOR_SIZE);
Expand Down Expand Up @@ -76,18 +80,43 @@ void* measure_freq(void *freq_ptr) {
sleep_ms(500);
}

freq->freq = vector_average_harmonic(freq_vector, v);
printWarn("AVX2 measured freq=%f\n", freq->freq);
if (cpu->hybrid_flag) {
// We have an heterogeneous architecture. After measuring the
// frequency for all cores, we now need to compute the average
// independently for each CPU module.
struct cpuInfo* ptr = cpu;
double* freq_vector_ptr = freq_vector;

for (int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) {
freq->max_pp[i] = vector_average_harmonic(freq_vector_ptr, ptr->topo->total_cores_module);
printWarn("AVX2 measured freq=%d (module %d)", freq->max_pp[i], i);

freq_vector_ptr = freq_vector_ptr + ptr->topo->total_cores_module;
}
}
else {
freq->max_pp[0] = vector_average_harmonic(freq_vector, v);
printWarn("AVX2 measured freq=%d\n", freq->max_pp[0]);
}

return NULL;
}

int64_t measure_frequency(struct cpuInfo* cpu) {
int32_t measure_frequency(struct cpuInfo* cpu, int32_t *max_freq_pp_vec) {
if (cpu->hybrid_flag && cpu->module_id > 0) {
// We have a hybrid architecture and we have already
// measured the frequency for this module in a previous
// call to this function, so now just return it.
return max_freq_pp_vec[cpu->module_id];
}

int ret;
int num_spaces;
struct freq_thread* freq_struct = malloc(sizeof(struct freq_thread));
freq_struct->end = false;
freq_struct->measure = false;
freq_struct->cpu = cpu;
freq_struct->max_pp = max_freq_pp_vec;

void* (*compute_function)(void*);

Expand Down Expand Up @@ -159,5 +188,5 @@ int64_t measure_frequency(struct cpuInfo* cpu) {
}

printf("\r%*c", num_spaces, ' ');
return freq_struct->freq;
return max_freq_pp_vec[0];
}
2 changes: 1 addition & 1 deletion src/x86/freq/freq.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@
#define MEASURE_TIME_SECONDS 5
#define LOOP_ITERS 100000000

int64_t measure_frequency(struct cpuInfo* cpu);
int32_t measure_frequency(struct cpuInfo* cpu, int32_t *max_freq_pp_vec);

#endif

0 comments on commit ab43a11

Please sign in to comment.