AliceO2/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx at 1a4806d96c5ec4a053deda3f2d89611834b65234 · davidrohr/AliceO2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUReconstructionDeviceBase.cxx
/// \author David Rohr

#include "GPUReconstructionDeviceBase.h"
#include "GPUReconstructionIncludes.h"

#include "GPUTPCTracker.h"

using namespace o2::gpu;

#if defined(_WIN32)
#include "../utils/pthread_mutex_win32_wrapper.h"
#else
#include <cerrno>
#include <unistd.h>
#endif
#include <cstring>

// Forward declaration only; the full definition of GPUTPCRow is not needed by this declaration.
class GPUTPCRow;

// Name handed to CreateSemaphore() / sem_open() / sem_unlink() for the named semaphore
// that GetGlobalLock() and ReleaseGlobalLock() use to guard GPU initialization.
#define SemLockName "AliceHLTTPCGPUTrackerInitLockSem"

/// Construct the device base; the backend settings are forwarded to GPUReconstructionCPU.
///
/// \param cfg        backend settings passed on to the GPUReconstructionCPU base constructor
/// \param sizeCheck  object size the caller expects for GPUReconstructionDeviceBase
///                   (presumably sizeof() as computed by the backend's compiler - TODO confirm against callers)
///
/// Calls GPUFatal() if sizeCheck does not equal sizeof(GPUReconstructionDeviceBase) in this translation unit.
GPUReconstructionDeviceBase::GPUReconstructionDeviceBase(const GPUSettingsDeviceBackend& cfg, size_t sizeCheck) : GPUReconstructionCPU(cfg)
{
  constexpr size_t expectedSize = sizeof(GPUReconstructionDeviceBase);
  const bool layoutMatches = (sizeCheck == expectedSize);
  if (!layoutMatches) {
    GPUFatal("Mismatch of C++ object size between GPU compilers!");
  }
}

// Defaulted out-of-line: the destructor itself performs no explicit cleanup beyond destroying members and bases.
GPUReconstructionDeviceBase::~GPUReconstructionDeviceBase() = default;

/// Acquire the global named semaphore (SemLockName) that locks GPU initialization.
///
/// - Windows: creates / opens the semaphore and waits on it without timeout.
/// - Other non-Apple platforms: opens the POSIX named semaphore and waits up to 10 seconds.
///   On timeout the holder is assumed to have died and the semaphore is posted once to reset it.
/// - Apple: no locking is performed; a null handle is returned.
///
/// \param pLock  receives the opaque handle to pass to ReleaseGlobalLock(); only written on success
/// \return 0 on success, 1 if the semaphore could not be created
int32_t GPUReconstructionDeviceBase::GetGlobalLock(void*& pLock)
{
#ifdef _WIN32
  HANDLE* semLock = new HANDLE;
  *semLock = CreateSemaphore(nullptr, 1, 1, SemLockName);
  if (*semLock == nullptr) {
    GPUError("Error creating GPUInit Semaphore");
    delete semLock; // The handle slot is never handed out on failure, so free it here
    return (1);
  }
  WaitForSingleObject(*semLock, INFINITE);
#elif !defined(__APPLE__) // GPU not supported on MacOS anyway
  sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1); // 0x01B6 == 0666 (rw for user, group, others)
  if (semLock == SEM_FAILED) {
    GPUError("Error creating GPUInit Semaphore");
    return (1);
  }
  // sem_timedwait() takes an absolute CLOCK_REALTIME deadline
  timespec semtime;
  clock_gettime(CLOCK_REALTIME, &semtime);
  semtime.tv_sec += 10;
  while (sem_timedwait(semLock, &semtime) != 0) {
    if (errno == EINTR) {
      // Interrupted by a signal, not timed out: retry with the same deadline
      // instead of posting the semaphore and breaking mutual exclusion.
      continue;
    }
    GPUError("Global Lock for GPU initialisation was not released for 10 seconds, assuming another thread died");
    GPUWarning("Resetting the global lock");
    sem_post(semLock);
  }
#else
  void* semLock = nullptr;
#endif
  pLock = semLock;
  return 0;
}

/// Release the global named semaphore that locks GPU initialization.
///
/// \param sem  opaque handle previously returned through GetGlobalLock()'s pLock argument
///
/// - Windows: releases the semaphore, closes the handle and frees the heap slot holding it.
/// - Other non-Apple platforms: posts the semaphore, closes this process' reference to it
///   and unlinks the name.
/// - Apple: nothing to do.
void GPUReconstructionDeviceBase::ReleaseGlobalLock(void* sem)
{
#ifdef _WIN32
  HANDLE* h = static_cast<HANDLE*>(sem);
  ReleaseSemaphore(*h, 1, nullptr);
  CloseHandle(*h);
  delete h;
#elif !defined(__APPLE__) // GPU not supported on MacOS anyway
  sem_t* pSem = static_cast<sem_t*>(sem);
  sem_post(pSem);
  // Pair the sem_open() done in GetGlobalLock(); without this every lock / unlock cycle
  // leaves a reference to the semaphore open in this process.
  sem_close(pSem);
  sem_unlink(SemLockName);
#endif
}

/// Initialize the device backend.
///
/// Steps, in order:
///  1. reject settings the device path does not support (individual allocation strategy, too many streams),
///  2. optionally take the global init lock (mProcessingSettings.globalInitMutex),
///  3. optionally register the debug events (mProcessingSettings.deviceTimers),
///  4. run the backend specific InitDevice_Runtime(),
///  5. release the global init lock again - on success AND on failure,
///  6. record the permanent memory base pointers and allocate the processors shadow memory.
///
/// \return 0 on success, 1 on any failure
int32_t GPUReconstructionDeviceBase::InitDevice()
{
  // cpu_set_t mask;
  // CPU_ZERO(&mask);
  // CPU_SET(0, &mask);
  // sched_setaffinity(0, sizeof(mask), &mask);

  if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
    GPUError("Individual memory allocation strategy unsupported for device\n");
    return (1);
  }
  if (mProcessingSettings.nStreams > GPUCA_MAX_STREAMS) {
    GPUError("Too many straems requested %d > %d\n", mProcessingSettings.nStreams, GPUCA_MAX_STREAMS);
    return (1);
  }

  void* semLock = nullptr;
  if (mProcessingSettings.globalInitMutex && GetGlobalLock(semLock)) {
    return (1);
  }

  if (mProcessingSettings.deviceTimers) {
    AddGPUEvents(mDebugEvents);
  }

  const int32_t retVal = InitDevice_Runtime();

  // Release the lock before looking at the result: returning early on failure while still
  // holding it would make every other initializer wait for the timeout in GetGlobalLock().
  if (mProcessingSettings.globalInitMutex) {
    ReleaseGlobalLock(semLock);
  }

  if (retVal) {
    GPUImportant("GPU Tracker initialization failed");
    return (1);
  }

  // Whatever the runtime init set as base pointers is treated as the start of the permanent region
  mDeviceMemoryPermanent = mDeviceMemoryBase;
  mHostMemoryPermanent = mHostMemoryBase;
  ClearAllocatedMemory();

  mProcShadow.InitGPUProcessor(this, GPUProcessor::PROCESSOR_TYPE_SLAVE);
  mProcShadow.mMemoryResProcessors = RegisterMemoryAllocation(&mProcShadow, &GPUProcessorProcessors::SetPointersDeviceProcessor, GPUMemoryResource::MEMORY_PERMANENT | GPUMemoryResource::MEMORY_HOST, "Processors");
  AllocateRegisteredMemory(mProcShadow.mMemoryResProcessors);

  if (mMaster == nullptr || mProcessingSettings.debugLevel >= 2) {
    GPUInfo("GPU Tracker initialization successfull"); // Verbosity reduced because GPU backend will print GPUImportant message!
  }

  return (retVal);
}

// Pointer-setting callback registered for the "Processors" memory resource in InitDevice().
// Hands mem and mProcessorsProc to computePointerWithAlignment() with a count of 1 and returns mem afterwards.
// NOTE(review): presumably computePointerWithAlignment() places mProcessorsProc at an aligned address
// inside the buffer and advances mem past it - confirm against its declaration.
void* GPUReconstructionDeviceBase::GPUProcessorProcessors::SetPointersDeviceProcessor(void* mem)
{
  // Don't run constructor / destructor here, this will be just local memcopy of Processors in GPU Memory
  computePointerWithAlignment(mem, mProcessorsProc, 1);
  return mem;
}

/// Shut down the device backend and forget all memory bookkeeping.
///
/// Runs ExitDevice_Runtime() first, then resets the processors shadow pointer, every
/// host / device memory pointer and both memory sizes.
///
/// \return the value returned by ExitDevice_Runtime()
int32_t GPUReconstructionDeviceBase::ExitDevice()
{
  const int32_t runtimeStatus = ExitDevice_Runtime();

  mProcessorsShadow = nullptr;

  // Device side pointers
  mDeviceMemoryPermanent = nullptr;
  mDeviceMemoryPoolEnd = nullptr;
  mDeviceMemoryBase = nullptr;
  mDeviceMemoryPool = nullptr;

  // Host side pointers
  mHostMemoryPermanent = nullptr;
  mHostMemoryPoolEnd = nullptr;
  mHostMemoryBase = nullptr;
  mHostMemoryPool = nullptr;

  // Sizes
  mDeviceMemorySize = 0;
  mHostMemorySize = 0;

  return runtimeStatus;
}

/// Default implementation of the memory registration hook.
///
/// \param ptr   not used by this implementation
/// \param size  not used by this implementation
/// \return the result of IsGPU(), converted to int32_t
///         (presumably non-zero signals that a real GPU backend should have provided its own implementation - TODO confirm)
int32_t GPUReconstructionDeviceBase::registerMemoryForGPU_internal(const void* ptr, size_t size)
{
  (void)ptr;
  (void)size;
  const int32_t status = IsGPU();
  return status;
}

/// Default implementation of the memory unregistration hook.
///
/// \param ptr  not used by this implementation
/// \return the result of IsGPU(), converted to int32_t
///         (presumably non-zero signals that a real GPU backend should have provided its own implementation - TODO confirm)
int32_t GPUReconstructionDeviceBase::unregisterMemoryForGPU_internal(const void* ptr)
{
  (void)ptr;
  const int32_t status = IsGPU();
  return status;
}

void GPUReconstructionDeviceBase::unregisterRemainingRegisteredMemory()
{
  for (auto& ptr : mRegisteredMemoryPtrs) {
    unregisterMemoryForGPU_internal(ptr);
  }
  mRegisteredMemoryPtrs.clear();
}

/// Invoke every registrator returned by getDeviceConstantMemRegistratorsVector(), in order,
/// and append each returned pointer to mDeviceConstantMemList.
/// A registrator returning nullptr is reported through GPUFatal().
void GPUReconstructionDeviceBase::runConstantRegistrators()
{
  auto& registrators = getDeviceConstantMemRegistratorsVector();
  for (auto& registrator : registrators) {
    auto* constantMem = registrator();
    if (constantMem == nullptr) {
      GPUFatal("Error registering constant memory");
    }
    mDeviceConstantMemList.emplace_back(constantMem);
  }
}

/// Copy the contents of a memory resource between host and device via GPUMemCpy().
///
/// \param res      resource describing the transfer; only its Type(), Name() and Size() are read here
/// \param stream   forwarded to GPUMemCpy()
/// \param ev       forwarded to GPUMemCpy()
/// \param evList   forwarded to GPUMemCpy()
/// \param nEvents  forwarded to GPUMemCpy()
/// \param toGPU    direction flag; forwarded to GPUMemCpy() and used for the log message
/// \param src      source address of the copy
/// \param dst      destination address of the copy
/// \return 0 if the resource does not have the MEMORY_GPU flag (nothing is copied),
///         otherwise the value returned by GPUMemCpy()
size_t GPUReconstructionDeviceBase::TransferMemoryInternal(GPUMemoryResource* res, int32_t stream, deviceEvent* ev, deviceEvent* evList, int32_t nEvents, bool toGPU, const void* src, void* dst)
{
  if (!(res->Type() & GPUMemoryResource::MEMORY_GPU)) {
    if (mProcessingSettings.debugLevel >= 4) {
      GPUInfo("Skipped transfer of non-GPU memory resource: %s", res->Name());
    }
    return 0;
  }
  // At debug level 3 every transfer except the "ErrorCodes" resource is logged; from level 4 on that one is logged too
  if (mProcessingSettings.debugLevel >= 3 && (strcmp(res->Name(), "ErrorCodes") || mProcessingSettings.debugLevel >= 4)) {
    // %lld with an explicit long long argument: int64_t is not 'long' on every supported platform, so %ld was not portable
    GPUInfo("Copying to %s: %s - %lld bytes", toGPU ? "GPU" : "Host", res->Name(), static_cast<long long>(res->Size()));
  }
  return GPUMemCpy(dst, src, res->Size(), stream, toGPU, ev, evList, nEvents);
}