@@ -1066,19 +1066,19 @@ void computeCellsHandler(
10661066 nSigmaCut); // const float
10671067}
10681068
1069- void countCellNeighboursHandler (CellSeed** cellsLayersDevice,
1070- int * neighboursLUT,
1071- int ** cellsLUTs,
1072- gpuPair<int , int >* cellNeighbours,
1073- int * neighboursIndexTable,
1074- const float maxChi2ClusterAttachment,
1075- const float bz,
1076- const int layerIndex,
1077- const unsigned int nCells,
1078- const unsigned int nCellsNext,
1079- const int maxCellNeighbours,
1080- const int nBlocks,
1081- const int nThreads)
1069+ unsigned int countCellNeighboursHandler (CellSeed** cellsLayersDevice,
1070+ int * neighboursLUT,
1071+ int ** cellsLUTs,
1072+ gpuPair<int , int >* cellNeighbours,
1073+ int * neighboursIndexTable,
1074+ const float maxChi2ClusterAttachment,
1075+ const float bz,
1076+ const int layerIndex,
1077+ const unsigned int nCells,
1078+ const unsigned int nCellsNext,
1079+ const int maxCellNeighbours,
1080+ const int nBlocks,
1081+ const int nThreads)
10821082{
10831083 gpu::computeLayerCellNeighboursKernel<true ><<<nBlocks, nThreads>>> (
10841084 cellsLayersDevice,
@@ -1091,8 +1091,7 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
10911091 layerIndex,
10921092 nCells,
10931093 maxCellNeighbours);
1094- // gpuCheckError(cudaPeekAtLastError());
1095- // gpuCheckError(cudaDeviceSynchronize());
1094+
10961095 void *d_temp_storage = nullptr , *d_temp_storage_2 = nullptr ;
10971096 size_t temp_storage_bytes = 0 , temp_storage_bytes_2 = 0 ;
10981097 gpuCheckError (cub::DeviceScan::InclusiveSum (d_temp_storage, // d_temp_storage
@@ -1102,28 +1101,31 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
11021101 nCellsNext)); // num_items
11031102
11041103 discardResult (cudaMalloc (&d_temp_storage, temp_storage_bytes));
1105- gpuCheckError (cub::DeviceScan::InclusiveSum (d_temp_storage, // d_temp_storage
1106- temp_storage_bytes, // temp_storage_bytes
1107- neighboursLUT, // d_in
1108- neighboursLUT, // d_out
1109- nCellsNext)); // num_items
1104+ gpuCheckError (cub::DeviceScan::InclusiveSum (d_temp_storage, // d_temp_storage
1105+ temp_storage_bytes, // temp_storage_bytes
1106+ neighboursLUT, // d_in
1107+ neighboursLUT, // d_out
1108+ nCellsNext)); // num_items
1109+
11101110 gpuCheckError (cub::DeviceScan::ExclusiveSum (d_temp_storage_2, // d_temp_storage
11111111 temp_storage_bytes_2, // temp_storage_bytes
11121112 neighboursIndexTable, // d_in
11131113 neighboursIndexTable, // d_out
11141114 nCells + 1 , // num_items
11151115 0 )); // NOLINT: this is the offset of the sum, not a pointer
1116+
11161117 discardResult (cudaMalloc (&d_temp_storage_2, temp_storage_bytes_2));
11171118 gpuCheckError (cub::DeviceScan::ExclusiveSum (d_temp_storage_2, // d_temp_storage
11181119 temp_storage_bytes_2, // temp_storage_bytes
11191120 neighboursIndexTable, // d_in
11201121 neighboursIndexTable, // d_out
11211122 nCells + 1 , // num_items
11221123 0 )); // NOLINT: this is the offset of the sum, not a pointer
1124+ unsigned int nNeighbours;
1125+ gpuCheckError (cudaMemcpy (&nNeighbours, &neighboursLUT[nCellsNext - 1 ], sizeof (unsigned int ), cudaMemcpyDeviceToHost));
11231126 gpuCheckError (cudaFree (d_temp_storage));
11241127 gpuCheckError (cudaFree (d_temp_storage_2));
1125- gpuCheckError (cudaPeekAtLastError ());
1126- gpuCheckError (cudaDeviceSynchronize ());
1128+ return nNeighbours;
11271129}
11281130
11291131void computeCellNeighboursHandler (CellSeed** cellsLayersDevice,
0 commit comments