Skip to content

Commit 6083af1

Browse files
committed
[prof] in gux_taptamggux.mad counters.cc, improve the handling of counter overhead
These are the results (1) keep overhead ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp [COUNTERS] *** USING RDTSC-BASED TIMERS (do not remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 4.4766s [COUNTERS] Fortran Other ( 0 ) : 0.1202s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0685s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 3.2400s for 1087437 events => throughput is 3.36E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.1007s for 32768 events => throughput is 3.25E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1673s for 16384 events => throughput is 9.79E+04 events/s [COUNTERS] Fortran Reweight ( 6 ) : 0.0521s for 16384 events => throughput is 3.14E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0687s for 16384 events => throughput is 2.38E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.1237s for 1087437 events => throughput is 8.79E+06 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4728s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0269s [COUNTERS] CudaCpp MEs ( 19 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 2.3496s for 14136681 events => throughput is 6.02E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 4.4409s [COUNTERS] OVERALL MEs ( 32 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s CUDACPP_RUNTIME_USECHRONOTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp [COUNTERS] *** USING STD::CHRONO TIMERS (do not remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 5.3144s [COUNTERS] Fortran Other ( 0 ) : 0.1588s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0674s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 4.0191s for 1087437 events => throughput is 2.71E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.0996s for 32768 events => throughput is 3.29E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1660s for 16384 events => throughput is 9.87E+04 events/s [COUNTERS] Fortran 
Reweight ( 6 ) : 0.0508s for 16384 events => throughput is 3.22E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0704s for 16384 events => throughput is 2.33E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.1482s for 1087437 events => throughput is 7.34E+06 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4718s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0267s [COUNTERS] CudaCpp MEs ( 19 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 2.8646s for 14136681 events => throughput is 4.94E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 5.2787s [COUNTERS] OVERALL MEs ( 32 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s (2) remove overhead CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0338s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 4.8244s [COUNTERS] PROGRAM COUNTEROVERHEAD : 0.8905s ------------------------------------------------------------- [COUNTERS] *** USING RDTSC-BASED TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.9339s [COUNTERS] Fortran Other ( 0 ) : 0.2954s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0674s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 2.7332s for 1087437 events => throughput is 3.98E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.1003s for 32768 events => throughput is 3.27E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1688s for 16384 events => throughput is 9.71E+04 events/s [COUNTERS] Fortran Reweight ( 6 ) : 0.0507s for 16384 events => throughput is 3.23E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0695s for 16384 events => throughput is 2.36E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.0924s for 1087437 events => throughput is 1.18E+07 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4692s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0263s [COUNTERS] CudaCpp MEs ( 19 ) : 
0.0357s for 16384 events => throughput is 4.59E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 1.8723s for 14136681 events => throughput is 7.55E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 3.8982s [COUNTERS] OVERALL MEs ( 32 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s CUDACPP_RUNTIME_USECHRONOTIMERS=1 CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0637s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 5.8826s [COUNTERS] PROGRAM COUNTEROVERHEAD : 1.6786s ------------------------------------------------------------- [COUNTERS] *** USING STD::CHRONO TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 4.2040s [COUNTERS] Fortran Other ( 0 ) : 0.4831s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0691s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 2.9924s for 1087437 events => throughput is 3.63E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.0983s for 32768 events => throughput is 3.33E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1669s for 16384 events => throughput is 9.81E+04 events/s [COUNTERS] Fortran Reweight ( 6 ) : 0.0506s for 16384 events => throughput is 3.24E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0676s for 16384 events => throughput is 2.42E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.0698s for 1087437 events => throughput is 1.56E+07 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4712s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0267s [COUNTERS] CudaCpp MEs ( 19 ) : 0.0350s for 16384 events => throughput is 4.68E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 1.9227s for 14136681 events => throughput is 7.35E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 4.1690s [COUNTERS] OVERALL MEs ( 32 ) : 0.0350s for 16384 events => throughput is 4.68E+05 events/s (3) remove overhead, disable individual timers (so here the overhead is 0) 
CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 CUDACPP_RUNTIME_DISABLECALLTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0333s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 4.1897s [COUNTERS] PROGRAM COUNTEROVERHEAD : 0.3330s ------------------------------------------------------------- [COUNTERS] *** USING RDTSC-BASED TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.8567s CUDACPP_RUNTIME_USECHRONOTIMERS=1 CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 CUDACPP_RUNTIME_DISABLECALLTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0659s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 4.5119s [COUNTERS] PROGRAM COUNTEROVERHEAD : 0.6594s ------------------------------------------------------------- [COUNTERS] *** USING STD::CHRONO TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.8525s
1 parent 3577a55 commit 6083af1

1 file changed

Lines changed: 12 additions & 10 deletions

File tree

  • epochX/cudacpp/gux_taptamggux.mad/SubProcesses

epochX/cudacpp/gux_taptamggux.mad/SubProcesses/counters.cc

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -171,13 +171,11 @@ extern "C"
171171
void counters_initialise_()
172172
{
173173
using namespace counters;
174-
if( getenv( "CUDACPP_RUNTIME_DISABLECALLTIMERS" ) ) disablecalltimers = true;
175-
if( getenv( "CUDACPP_RUNTIME_DISABLETESTTIMERS" ) ) disabletesttimers = true;
176174
#ifdef MGONGPU_HASRDTSC
177175
if( getenv( "CUDACPP_RUNTIME_USECHRONOTIMERS" ) ) usechronotimers = true;
178176
#endif
179177
if( getenv( "CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD" ) ) removetimeroverhead = true;
180-
for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ )
178+
for( int icounter = 0; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3
181179
{
182180
array_tags[icounter] = ""; // ensure that this is initialized to ""
183181
array_istesttimer[icounter] = false; // ensure that this is initialized to false
@@ -193,7 +191,7 @@ extern "C"
193191
counters_register_counter_( &icalibcounter, "OVERHEAD CALIBRATION" );
194192
mgOnGpu::ChronoTimer<std::chrono::high_resolution_clock> calibtimer;
195193
calibtimer.start();
196-
constexpr size_t ncall = 1000000;
194+
constexpr size_t ncall = 10000000; // 10M calls are expected to take slightly less than ~1s (this will be in counter overhead)
197195
for( size_t icall = 0; icall < ncall; icall++ )
198196
{
199197
counters_start_counter_( &icalibcounter, &nevtdummy );
@@ -202,6 +200,8 @@ extern "C"
202200
calibtimer.stop();
203201
overheadpercallseconds = calibtimer.getTotalDurationSeconds() / ncall;
204202
}
203+
if( getenv( "CUDACPP_RUNTIME_DISABLECALLTIMERS" ) ) disablecalltimers = true;
204+
if( getenv( "CUDACPP_RUNTIME_DISABLETESTTIMERS" ) ) disabletesttimers = true;
205205
return;
206206
}
207207

@@ -216,10 +216,12 @@ extern "C"
216216
float program_totaltime = ( usechronotimers ? program_chronotimer.getTotalDurationSeconds() : program_rdtsctimer.getTotalDurationSeconds() );
217217
float program_overhead = 0;
218218
// Extract time duration from all timers
219-
float array_totaltimes[NCOUNTERSMAX + 3] = { 0 };
220-
float array_overheads[NCOUNTERSMAX + 3] = { 0 };
221-
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ )
219+
float array_totaltimes[NCOUNTERSMAX + 4] = { 0 };
220+
float array_overheads[NCOUNTERSMAX + 4] = { 0 };
221+
for( int icounter = 1; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3
222222
{
223+
if( icounter == NCOUNTERSMAX + 1 ) continue;
224+
if( icounter == NCOUNTERSMAX + 2 ) continue;
223225
if( usechronotimers )
224226
array_totaltimes[icounter] = array_chronotimers[icounter].getTotalDurationSeconds();
225227
else
@@ -235,7 +237,7 @@ extern "C"
235237
// Remove overheads of included timers if any
236238
if( removetimeroverhead )
237239
{
238-
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ )
240+
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ ) // no need to include icalibcounter = NCOUNTERSMAX+3
239241
{
240242
for( int icounterIn : array_included[icounter] )
241243
array_totaltimes[icounter] -= array_overheads[icounterIn];
@@ -259,7 +261,7 @@ extern "C"
259261
array_tags[0] = "Fortran Other";
260262
array_counters[0] = 1;
261263
array_totaltimes[0] = program_totaltime;
262-
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ )
264+
for( int icounter = 1; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3
263265
{
264266
if( !array_istesttimer[icounter] ) // skip TEST counters
265267
array_totaltimes[0] -= array_totaltimes[icounter];
@@ -280,7 +282,7 @@ extern "C"
280282
array_counters[NCOUNTERSMAX + 1] = 1;
281283
array_totaltimes[NCOUNTERSMAX + 1] = program_totaltime - array_totaltimes[NCOUNTERSMAX + 2];
282284
// Dump individual counters
283-
for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ )
285+
for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ ) // exclude icalibcounter = NCOUNTERSMAX+3 (would print a negative value here!)
284286
{
285287
if( array_tags[icounter] != "" )
286288
{

0 commit comments

Comments
 (0)