Skip to content

Commit 2c8d0dc

Browse files
benoitsteiner authored and Vijay Vasudevan committed
OpenCL Improvements (#7596)
* OpenCL improvements
  Added Tile, Transpose and Range Ops double support for SYCL device. Moved gpu_device_name() to test_util.py so now it can be used in force_gpu to pull either GPU or SYCL depending on what is available in the system.
* Improvements to the SYCL device support
  - Registration of Type Traits required for stride slice op
  - Registration of ConcatOffset, _ListToArray, _ArrayToList, Pad, Reverse ( CPU ), ReverseV2 ( CPU ), Size, ExpandDims, Squeeze, StridedSlice, StridedSliceGrad, StridedSliceAssign, TileGrad, InvertPermutation, Transpose
  - Registration of Sycl kernels only for essential data types
  - Floor_div_real has been disabled for SYCL device
  - Device in control_flow_ops_py_test.py needed to be lower cased
* SYCL support improvements (#31)
* Improvements to the SYCL device support
  This commit reduces the number of failing tests when TensorFlow compiles for OpenCL support.
  - Registration of Type Traits required for stride slice op
  - Registration of ConcatOffset, _ListToArray, _ArrayToList, Pad, Reverse ( CPU ), ReverseV2 ( CPU ), Size, ExpandDims, Squeeze, StridedSlice, StridedSliceGrad, StridedSliceAssign, TileGrad, InvertPermutation, Transpose
  - Registration of Sycl kernels only for essential data types
  - Floor_div_real has been disabled for SYCL device
  - Device in control_flow_ops_py_test.py needed to be lower cased
* Fixes & Version bump (#33)
* Fix Unbuntu typo. (#38)
  unbuntu -> ubuntu
* Add problem descriptions and solutions (#35)
* Add ComputeCpp lib folder to LD_LIBRARY_PATH
* Add ImportError problem + solution
  If you get the error message "ImportError: libComputeCpp.so: cannot open shared object file: No such file or directory", make sure you have added the path to ComputeCpp's lib folder to your `LD_LIBRARY_PATH`.
* Add another ImportError problem + solution
  If you get the error message "ImportError: cannot import name 'pywrap_tensorflow'" you may be standing in the TensorFlow directory.
* Improvements to the SYCL device support
* Registers FloorDiv, FloorMod and SoftMax Ops for SYCL device
* Workaround for 0 bytes allocation for SYCL device (#42)
* Sycl improvements (#44)
  - Eigen version bump
  - Extends Cast and Cwise ops benchmark to cover Sycl device
  - Extends device_lib_test.py to cover Sycl device
  - Registers int32, string and ResourceHandler to run on host for Enter and RefEnter Sycl Ops
  - Enables ReduceMax op for Sycl since Eigen implementation is ready
  - Registers Less op for Sycl device
* Improved the formatting of the SYCL code
* Fixed compilation error.
* Made sure that using test sessions with force_gpu=True forces the placement on a gpu device even if none is detected.
1 parent 43c71a0 commit 2c8d0dc

102 files changed

Lines changed: 1451 additions & 221 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

tensorflow/core/BUILD

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ load(
116116
"//third_party/mkl:build_defs.bzl",
117117
"if_mkl",
118118
)
119+
load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
120+
119121
# -----------------------------------------------------------------------------
120122
# Public targets
121123

@@ -729,7 +731,7 @@ cc_library(
729731
"//tensorflow/core/kernels:ops_testutil",
730732
"//tensorflow/core/kernels:ops_util",
731733
"//tensorflow/core/platform/default/build_config:gtest",
732-
],
734+
] + if_sycl([":sycl_runtime"]),
733735
)
734736

735737
# This is a link-only library to provide a DirectSession

tensorflow/core/common_runtime/sycl/sycl_allocator.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ string SYCLAllocator::Name() { return "device:SYCL"; }
2525

2626
void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
2727
assert(device_);
28+
if (num_bytes == 0) {
29+
return device_->allocate(1);
30+
}
2831
auto p = device_->allocate(num_bytes);
2932
return p;
3033
}
@@ -42,6 +45,6 @@ void SYCLAllocator::EnterLameDuckMode() {
4245
}
4346
}
4447

45-
} // namespace tensorflow
48+
} // namespace tensorflow
4649

47-
#endif // TENSORFLOW_USE_SYCL
50+
#endif // TENSORFLOW_USE_SYCL

tensorflow/core/common_runtime/sycl/sycl_allocator.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,21 @@ limitations under the License.
2727
namespace tensorflow {
2828

2929
class SYCLAllocator : public Allocator {
30-
public:
31-
SYCLAllocator(Eigen::QueueInterface* device) : device_(device) {}
30+
public:
31+
SYCLAllocator(Eigen::QueueInterface *device) : device_(device) {}
3232
virtual ~SYCLAllocator() override;
3333
string Name() override;
3434
void *AllocateRaw(size_t alignment, size_t num_bytes) override;
3535
void DeallocateRaw(void *ptr) override;
3636

3737
void EnterLameDuckMode();
3838
virtual bool ShouldAllocateEmptyTensors() override final { return true; }
39-
private:
39+
40+
private:
4041
Eigen::QueueInterface *device_; // not owned
4142
TF_DISALLOW_COPY_AND_ASSIGN(SYCLAllocator);
4243
};
4344

44-
} // namespace tensorflow
45+
} // namespace tensorflow
4546

46-
#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
47+
#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_

tensorflow/core/common_runtime/sycl/sycl_device.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ limitations under the License.
2323

2424
namespace tensorflow {
2525

26-
static std::unordered_set<SYCLDevice*> live_devices;
26+
static std::unordered_set<SYCLDevice *> live_devices;
2727
static bool first_time = true;
2828

2929
void ShutdownSycl() {

tensorflow/core/common_runtime/sycl/sycl_device.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,11 @@ class SYCLDevice : public LocalDevice {
3434
Bytes memory_limit, const DeviceLocality &locality,
3535
const string &physical_device_desc, SYCLSelector sycl_selector,
3636
Allocator *cpu_allocator)
37-
: LocalDevice(options, Device::BuildDeviceAttributes(
38-
name, DEVICE_SYCL, memory_limit, locality,
39-
physical_device_desc),
40-
nullptr),
37+
: LocalDevice(
38+
options,
39+
Device::BuildDeviceAttributes(name, DEVICE_SYCL, memory_limit,
40+
locality, physical_device_desc),
41+
nullptr),
4142
cpu_allocator_(cpu_allocator),
4243
sycl_queue_(new Eigen::QueueInterface(sycl_selector)),
4344
sycl_device_(new Eigen::SyclDevice(sycl_queue_)),

tensorflow/core/common_runtime/sycl/sycl_device_context.cc

Lines changed: 128 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ limitations under the License.
1717

1818
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
1919

20-
#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
2120
#include "tensorflow/core/common_runtime/dma_helper.h"
21+
#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
2222

2323
namespace tensorflow {
2424

@@ -31,68 +31,68 @@ void SYCLDeviceContext::CopyCPUTensorToDevice(const Tensor *cpu_tensor,
3131
const void *src_ptr = DMAHelper::base(cpu_tensor);
3232
void *dst_ptr = DMAHelper::base(device_tensor);
3333
switch (cpu_tensor->dtype()) {
34-
case DT_FLOAT:
35-
device->eigen_sycl_device()->memcpyHostToDevice(
36-
static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
37-
total_bytes);
38-
break;
39-
case DT_DOUBLE:
40-
device->eigen_sycl_device()->memcpyHostToDevice(
41-
static_cast<double *>(dst_ptr), static_cast<const double *>(src_ptr),
42-
total_bytes);
43-
break;
44-
case DT_INT32:
45-
device->eigen_sycl_device()->memcpyHostToDevice(
46-
static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
47-
total_bytes);
48-
break;
49-
case DT_INT64:
50-
device->eigen_sycl_device()->memcpyHostToDevice(
51-
static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
52-
total_bytes);
53-
break;
54-
case DT_HALF:
55-
device->eigen_sycl_device()->memcpyHostToDevice(
56-
static_cast<Eigen::half *>(dst_ptr),
57-
static_cast<const Eigen::half *>(src_ptr), total_bytes);
58-
break;
59-
case DT_COMPLEX64:
60-
device->eigen_sycl_device()->memcpyHostToDevice(
61-
static_cast<std::complex<float> *>(dst_ptr),
62-
static_cast<const std::complex<float> *>(src_ptr), total_bytes);
63-
break;
64-
case DT_COMPLEX128:
65-
device->eigen_sycl_device()->memcpyHostToDevice(
66-
static_cast<std::complex<double> *>(dst_ptr),
67-
static_cast<const std::complex<double> *>(src_ptr), total_bytes);
68-
break;
69-
case DT_INT8:
70-
device->eigen_sycl_device()->memcpyHostToDevice(
71-
static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
72-
total_bytes);
73-
break;
74-
case DT_INT16:
75-
device->eigen_sycl_device()->memcpyHostToDevice(
76-
static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
77-
total_bytes);
78-
break;
79-
case DT_UINT8:
80-
device->eigen_sycl_device()->memcpyHostToDevice(
81-
static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
82-
total_bytes);
83-
break;
84-
case DT_UINT16:
85-
device->eigen_sycl_device()->memcpyHostToDevice(
86-
static_cast<uint16 *>(dst_ptr), static_cast<const uint16 *>(src_ptr),
87-
total_bytes);
88-
break;
89-
case DT_BOOL:
90-
device->eigen_sycl_device()->memcpyHostToDevice(
91-
static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
92-
total_bytes);
93-
break;
94-
default:
95-
assert(false && "unsupported type");
34+
case DT_FLOAT:
35+
device->eigen_sycl_device()->memcpyHostToDevice(
36+
static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
37+
total_bytes);
38+
break;
39+
case DT_DOUBLE:
40+
device->eigen_sycl_device()->memcpyHostToDevice(
41+
static_cast<double *>(dst_ptr),
42+
static_cast<const double *>(src_ptr), total_bytes);
43+
break;
44+
case DT_INT32:
45+
device->eigen_sycl_device()->memcpyHostToDevice(
46+
static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
47+
total_bytes);
48+
break;
49+
case DT_INT64:
50+
device->eigen_sycl_device()->memcpyHostToDevice(
51+
static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
52+
total_bytes);
53+
break;
54+
case DT_HALF:
55+
device->eigen_sycl_device()->memcpyHostToDevice(
56+
static_cast<Eigen::half *>(dst_ptr),
57+
static_cast<const Eigen::half *>(src_ptr), total_bytes);
58+
break;
59+
case DT_COMPLEX64:
60+
device->eigen_sycl_device()->memcpyHostToDevice(
61+
static_cast<std::complex<float> *>(dst_ptr),
62+
static_cast<const std::complex<float> *>(src_ptr), total_bytes);
63+
break;
64+
case DT_COMPLEX128:
65+
device->eigen_sycl_device()->memcpyHostToDevice(
66+
static_cast<std::complex<double> *>(dst_ptr),
67+
static_cast<const std::complex<double> *>(src_ptr), total_bytes);
68+
break;
69+
case DT_INT8:
70+
device->eigen_sycl_device()->memcpyHostToDevice(
71+
static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
72+
total_bytes);
73+
break;
74+
case DT_INT16:
75+
device->eigen_sycl_device()->memcpyHostToDevice(
76+
static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
77+
total_bytes);
78+
break;
79+
case DT_UINT8:
80+
device->eigen_sycl_device()->memcpyHostToDevice(
81+
static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
82+
total_bytes);
83+
break;
84+
case DT_UINT16:
85+
device->eigen_sycl_device()->memcpyHostToDevice(
86+
static_cast<uint16 *>(dst_ptr),
87+
static_cast<const uint16 *>(src_ptr), total_bytes);
88+
break;
89+
case DT_BOOL:
90+
device->eigen_sycl_device()->memcpyHostToDevice(
91+
static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
92+
total_bytes);
93+
break;
94+
default:
95+
assert(false && "unsupported type");
9696
}
9797
}
9898
device->eigen_sycl_device()->synchronize();
@@ -106,76 +106,76 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor,
106106
StatusCallback done) {
107107
const int64 total_bytes = device_tensor->TotalBytes();
108108
if (total_bytes > 0) {
109-
const void* src_ptr = DMAHelper::base(device_tensor);
110-
void* dst_ptr = DMAHelper::base(cpu_tensor);
109+
const void *src_ptr = DMAHelper::base(device_tensor);
110+
void *dst_ptr = DMAHelper::base(cpu_tensor);
111111
switch (device_tensor->dtype()) {
112-
case DT_FLOAT:
113-
device->eigen_sycl_device()->memcpyDeviceToHost(
114-
static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
115-
total_bytes);
116-
break;
117-
case DT_DOUBLE:
118-
device->eigen_sycl_device()->memcpyDeviceToHost(
119-
static_cast<double *>(dst_ptr), static_cast<const double *>(src_ptr),
120-
total_bytes);
121-
break;
122-
case DT_INT32:
123-
device->eigen_sycl_device()->memcpyDeviceToHost(
124-
static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
125-
total_bytes);
126-
break;
127-
case DT_INT64:
128-
device->eigen_sycl_device()->memcpyDeviceToHost(
129-
static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
130-
total_bytes);
131-
break;
132-
case DT_HALF:
133-
device->eigen_sycl_device()->memcpyDeviceToHost(
134-
static_cast<Eigen::half *>(dst_ptr),
135-
static_cast<const Eigen::half *>(src_ptr), total_bytes);
136-
break;
137-
case DT_COMPLEX64:
138-
device->eigen_sycl_device()->memcpyDeviceToHost(
139-
static_cast<std::complex<float> *>(dst_ptr),
140-
static_cast<const std::complex<float> *>(src_ptr), total_bytes);
141-
break;
142-
case DT_COMPLEX128:
143-
device->eigen_sycl_device()->memcpyDeviceToHost(
144-
static_cast<std::complex<double> *>(dst_ptr),
145-
static_cast<const std::complex<double> *>(src_ptr), total_bytes);
146-
break;
147-
case DT_INT8:
148-
device->eigen_sycl_device()->memcpyDeviceToHost(
149-
static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
150-
total_bytes);
151-
break;
152-
case DT_INT16:
153-
device->eigen_sycl_device()->memcpyDeviceToHost(
154-
static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
155-
total_bytes);
156-
break;
157-
case DT_UINT8:
158-
device->eigen_sycl_device()->memcpyDeviceToHost(
159-
static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
160-
total_bytes);
161-
break;
162-
case DT_UINT16:
163-
device->eigen_sycl_device()->memcpyDeviceToHost(
164-
static_cast<uint16 *>(dst_ptr), static_cast<const uint16 *>(src_ptr),
165-
total_bytes);
166-
break;
167-
case DT_BOOL:
168-
device->eigen_sycl_device()->memcpyDeviceToHost(
169-
static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
170-
total_bytes);
171-
break;
172-
default:
173-
assert(false && "unsupported type");
112+
case DT_FLOAT:
113+
device->eigen_sycl_device()->memcpyDeviceToHost(
114+
static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
115+
total_bytes);
116+
break;
117+
case DT_DOUBLE:
118+
device->eigen_sycl_device()->memcpyDeviceToHost(
119+
static_cast<double *>(dst_ptr),
120+
static_cast<const double *>(src_ptr), total_bytes);
121+
break;
122+
case DT_INT32:
123+
device->eigen_sycl_device()->memcpyDeviceToHost(
124+
static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
125+
total_bytes);
126+
break;
127+
case DT_INT64:
128+
device->eigen_sycl_device()->memcpyDeviceToHost(
129+
static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
130+
total_bytes);
131+
break;
132+
case DT_HALF:
133+
device->eigen_sycl_device()->memcpyDeviceToHost(
134+
static_cast<Eigen::half *>(dst_ptr),
135+
static_cast<const Eigen::half *>(src_ptr), total_bytes);
136+
break;
137+
case DT_COMPLEX64:
138+
device->eigen_sycl_device()->memcpyDeviceToHost(
139+
static_cast<std::complex<float> *>(dst_ptr),
140+
static_cast<const std::complex<float> *>(src_ptr), total_bytes);
141+
break;
142+
case DT_COMPLEX128:
143+
device->eigen_sycl_device()->memcpyDeviceToHost(
144+
static_cast<std::complex<double> *>(dst_ptr),
145+
static_cast<const std::complex<double> *>(src_ptr), total_bytes);
146+
break;
147+
case DT_INT8:
148+
device->eigen_sycl_device()->memcpyDeviceToHost(
149+
static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
150+
total_bytes);
151+
break;
152+
case DT_INT16:
153+
device->eigen_sycl_device()->memcpyDeviceToHost(
154+
static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
155+
total_bytes);
156+
break;
157+
case DT_UINT8:
158+
device->eigen_sycl_device()->memcpyDeviceToHost(
159+
static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
160+
total_bytes);
161+
break;
162+
case DT_UINT16:
163+
device->eigen_sycl_device()->memcpyDeviceToHost(
164+
static_cast<uint16 *>(dst_ptr),
165+
static_cast<const uint16 *>(src_ptr), total_bytes);
166+
break;
167+
case DT_BOOL:
168+
device->eigen_sycl_device()->memcpyDeviceToHost(
169+
static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
170+
total_bytes);
171+
break;
172+
default:
173+
assert(false && "unsupported type");
174174
}
175175
}
176176
device->eigen_sycl_device()->synchronize();
177177
done(Status::OK());
178178
}
179179

180180
} // namespace tensorflow
181-
#endif // TENSORFLOW_USE_SYCL
181+
#endif // TENSORFLOW_USE_SYCL

tensorflow/core/common_runtime/sycl/sycl_device_context.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ limitations under the License.
2626
namespace tensorflow {
2727

2828
class SYCLDeviceContext : public DeviceContext {
29-
public:
29+
public:
3030
SYCLDeviceContext() {}
3131

3232
~SYCLDeviceContext() override {}
@@ -40,6 +40,6 @@ class SYCLDeviceContext : public DeviceContext {
4040
StatusCallback done) override;
4141
};
4242

43-
} // namespace tensorflow
43+
} // namespace tensorflow
4444

45-
#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_
45+
#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_

0 commit comments

Comments
 (0)