|
21 | 21 | #include "./tensor_math_cpp.h" |
22 | 22 | #include "./tensor_math_cuda.h" |
23 | 23 | #include "./tensor_math_opencl.h" |
| 24 | + |
24 | 25 | #include <utility> |
25 | 26 | #include <algorithm> |
26 | 27 |
|
| 28 | +//#include <tc/lang/error_report.h> |
| 29 | +//#include <tc/core/compiler.h> |
| 30 | +#include "tc/core/check.h" |
| 31 | +#include "tc/core/compiler.h" |
| 32 | +#include "tc/core/tc_executor.h" |
| 33 | +#include "tc/core/tensor.h" |
27 | 34 |
|
28 | 35 | #define Noaxis 9999 |
29 | 36 |
|
| 37 | +// The `lang` namespace already exists inside singa,
| 38 | +// so alias TC's `lang` namespace to avoid a name clash
| 39 | +namespace tclang = lang; |
| 40 | + |
30 | 41 | namespace singa { |
31 | 42 |
|
32 | 43 | Tensor::~Tensor() { |
@@ -1334,4 +1345,158 @@ Tensor Reshape(const Tensor &in, const Shape &s) { |
1334 | 1345 | return out.Reshape(s); |
1335 | 1346 | } |
1336 | 1347 |
|
| 1348 | + |
| 1349 | +/// tc integration start |
| 1350 | +struct SingaDLManagedTensor { |
| 1351 | + Tensor handle; |
| 1352 | + DLManagedTensor tensor; |
| 1353 | +}; |
| 1354 | + |
| 1355 | +void deleter(DLManagedTensor* arg) { |
| 1356 | + delete static_cast<SingaDLManagedTensor*>(arg->manager_ctx); |
| 1357 | +} |
| 1358 | + |
| 1359 | +static DLDataType getDLDataType(const Tensor& t) { |
| 1360 | + DLDataType dtype; |
| 1361 | + dtype.lanes = 1; |
| 1362 | + // TODO: get the number of bytes of the datatype |
| 1363 | + //dtype.bits = t.data_type() * 8; |
| 1364 | + dtype.bits = 4 * 8; |
| 1365 | + switch (t.data_type()) { |
| 1366 | + case kFloat32: |
| 1367 | + dtype.code = DLDataTypeCode::kDLFloat; |
| 1368 | + break; |
| 1369 | + default: |
| 1370 | + throw std::logic_error("only kFloat32 is supported for dlpack conversion"); |
| 1371 | + break; |
| 1372 | + } |
| 1373 | + return dtype; |
| 1374 | +} |
| 1375 | + |
| 1376 | +static DLContext getDLContext(const Tensor& tensor, const int64_t& device_id) { |
| 1377 | + DLContext ctx; |
| 1378 | + ctx.device_id = device_id; |
| 1379 | + ctx.device_type = DLDeviceType::kDLGPU; |
| 1380 | + //TODO: fix this |
| 1381 | + //if (tensor.is_cuda()) { |
| 1382 | + // ctx.device_type = DLDeviceType::kDLGPU; |
| 1383 | + //} else { |
| 1384 | + // ctx.device_type = DLDeviceType::kDLCPU; |
| 1385 | + //} |
| 1386 | + return ctx; |
| 1387 | +} |
| 1388 | + |
| 1389 | +// This function returns a shared_ptr to memory managed DLpack tensor |
| 1390 | +// constructed out of ATen tensor |
| 1391 | +DLManagedTensor* toDLPack(const Tensor& src) { |
| 1392 | + SingaDLManagedTensor* singaDLManagedTensor(new SingaDLManagedTensor); |
| 1393 | + singaDLManagedTensor->handle = src; |
| 1394 | + singaDLManagedTensor->tensor.manager_ctx = singaDLManagedTensor; |
| 1395 | + singaDLManagedTensor->tensor.deleter = &deleter; |
| 1396 | + singaDLManagedTensor->tensor.dl_tensor.data = src.block()->mutable_data(); |
| 1397 | + int64_t device_id = 0; |
| 1398 | + // TODO: fix this |
| 1399 | + //if (src.is_cuda()) { |
| 1400 | + // device_id = src.get_device(); |
| 1401 | + //} |
| 1402 | + singaDLManagedTensor->tensor.dl_tensor.ctx = getDLContext(src, device_id); |
| 1403 | + singaDLManagedTensor->tensor.dl_tensor.ndim = src.nDim(); |
| 1404 | + singaDLManagedTensor->tensor.dl_tensor.dtype = getDLDataType(src); |
| 1405 | + |
| 1406 | + auto shapeVec = new std::vector<int64_t>(src.shape().begin(),src.shape().end()); |
| 1407 | + singaDLManagedTensor->tensor.dl_tensor.shape = shapeVec->data(); |
| 1408 | + |
| 1409 | + auto strideVec = new std::vector<int64_t>(src.stride().begin(),src.stride().end()); |
| 1410 | + singaDLManagedTensor->tensor.dl_tensor.strides = strideVec->data(); |
| 1411 | + |
| 1412 | + singaDLManagedTensor->tensor.dl_tensor.byte_offset = 0; |
| 1413 | + return &(singaDLManagedTensor->tensor); |
| 1414 | +} |
| 1415 | + |
| 1416 | +// Helpers to infer and allocate output tensors for a TC entry point
| 1417 | +std::vector<tc::DLTensorUPtr> inferOutputTensorInfo( |
| 1418 | + const std::string& tc, |
| 1419 | + const std::string& entryPoint, |
| 1420 | + const std::vector<Tensor>& inputs) { |
| 1421 | + auto parsedTcs = tc::detail::parse(tc); |
| 1422 | + if (parsedTcs.count(entryPoint) != 1u) { |
| 1423 | + TC_CHECK_GE(parsedTcs.size(), 1u) |
| 1424 | + << "No TC was parsed, should have thrown earlier"; |
| 1425 | + throw tclang::ErrorReport(parsedTcs.begin()->second) |
| 1426 | + << "\nattempting to access undefined entryPoint: " << entryPoint; |
| 1427 | + } |
| 1428 | + auto inputDLTensors = makeDLConstTensors(inputs); |
| 1429 | + return makeDLTensorVector(tc::detail::inferOutputTensorInfo(parsedTcs.at(entryPoint), extractRawPtrs(inputDLTensors))); |
| 1430 | +} |
| 1431 | + |
| 1432 | +std::vector<Tensor> prepareOutputs( |
| 1433 | + const std::string& tc, |
| 1434 | + const std::string& entryPoint, |
| 1435 | + const std::vector<Tensor>& inputs) { |
| 1436 | + std::vector<Tensor> outputs; |
| 1437 | + auto outTensorInfo = inferOutputTensorInfo(tc, entryPoint, inputs); |
| 1438 | + if (outTensorInfo.size() == 0) { |
| 1439 | + return outputs; |
| 1440 | + } |
| 1441 | + TC_CHECK_GE(inputs.size(), 1u) |
| 1442 | + << "NYI: Need >= 1 input tensors to determine " |
| 1443 | + << "backend and prepare ATen outputs. Add an overload with just an ATen " |
| 1444 | + << "backend"; |
| 1445 | + |
| 1446 | + auto dev = inputs[0].device(); |
| 1447 | + auto dtype = inputs[0].data_type(); |
| 1448 | + for (size_t i = 0; i < outTensorInfo.size(); ++i) { |
| 1449 | + tc::TensorInfo info(outTensorInfo[i]); |
| 1450 | + Shape shape(info.shape.begin(), info.shape.end()); |
| 1451 | + |
| 1452 | + Tensor tmp(shape, dev, dtype); |
| 1453 | + outputs.push_back(tmp); |
| 1454 | + } |
| 1455 | + return outputs; |
| 1456 | +} |
| 1457 | + |
| 1458 | + |
| 1459 | +// examples of TC operations |
| 1460 | +Tensor SoftMaxTC(const Tensor &in) { |
| 1461 | + std::string tc= R"TC( |
| 1462 | +def softmax(float(N, D) I) -> (O, expsum) { |
| 1463 | + expsum(n) +=! exp(I(n, d)) |
| 1464 | + O(n, d) = exp(I(n, d)) / expsum(n) |
| 1465 | +} |
| 1466 | +)TC"; |
| 1467 | + auto naiveOptions = tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions(); |
| 1468 | + auto pExecutor = singa::compileTC<tc::CudaBackend>(tc, "softmax", {in}, {naiveOptions}); |
| 1469 | + auto outputs = singa::prepareOutputs(tc, "softmax", {in}); |
| 1470 | + singa::runTC(*pExecutor, {in}, outputs); |
| 1471 | + return outputs[0]; |
| 1472 | +} |
| 1473 | + |
| 1474 | +Tensor ReluTC(const Tensor &in) { |
| 1475 | + std::string tc = R"TC( |
| 1476 | +def relu(float(B,M) I) -> (O1){ |
| 1477 | + O1(b, m) = fmax(I(b, m), 0) |
| 1478 | +} |
| 1479 | + )TC"; |
| 1480 | + auto naiveOptions = tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions(); |
| 1481 | + auto pExecutor = singa::compileTC<tc::CudaBackend>(tc, "relu", {in}, {naiveOptions}); |
| 1482 | + auto outputs = singa::prepareOutputs(tc, "relu", {in}); |
| 1483 | + singa::runTC(*pExecutor, {in}, outputs); |
| 1484 | + return outputs[0]; |
| 1485 | +} |
| 1486 | + |
| 1487 | +Tensor MatMulTC(const Tensor &in1,const Tensor &in2) { |
| 1488 | + std::string tc = R"TC( |
| 1489 | +def matmul(float(M,N) A, float(N,K) B) -> (output) { |
| 1490 | + output(i, j) +=! A(i, kk) * B(kk, j) |
| 1491 | +} |
| 1492 | + )TC"; |
| 1493 | + auto naiveOptions = tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions(); |
| 1494 | + auto pExecutor = singa::compileTC<tc::CudaBackend>(tc, "matmul", {in1, in2}, {naiveOptions}); |
| 1495 | + auto outputs = singa::prepareOutputs(tc, "matmul", {in1, in2}); |
| 1496 | + singa::runTC(*pExecutor, {in1, in2}, outputs); |
| 1497 | + return outputs[0]; |
| 1498 | +} |
| 1499 | +/// tc integration end |
| 1500 | + |
| 1501 | + |
1337 | 1502 | } // namespace singa |
0 commit comments