
Commit 096a226

Merge pull request #46 from johnh2o2/feature/sparse-bls-gpu
Feature/sparse bls gpu
2 parents 4b22075 + 1b200f4 commit 096a226

6 files changed

Lines changed: 829 additions & 4 deletions


.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 
     steps:
       - uses: actions/checkout@v3

cuvarbase/bls.py

Lines changed: 133 additions & 0 deletions
@@ -1150,6 +1150,139 @@ def sparse_bls_cpu(t, y, dy, freqs, ignore_negative_delta_sols=False):
     return bls_powers, solutions
 
 
+def compile_sparse_bls(block_size=_default_block_size, use_simple=True, **kwargs):
+    """
+    Compile the sparse BLS GPU kernel.
+
+    Parameters
+    ----------
+    block_size: int, optional (default: _default_block_size)
+        CUDA threads per CUDA block.
+    use_simple: bool, optional (default: True)
+        Use the simplified kernel (more reliable, slightly slower).
+
+    Returns
+    -------
+    kernel: PyCUDA function
+        The compiled sparse BLS kernel function.
+    """
+    # Read the kernel source; the simple version is the default because
+    # it is the more reliable of the two.
+    kernel_name = 'sparse_bls_simple' if use_simple else 'sparse_bls'
+    cppd = dict(BLOCK_SIZE=block_size)
+    kernel_txt = _module_reader(find_kernel(kernel_name),
+                                cpp_defs=cppd)
+
+    # Compile the kernel
+    module = SourceModule(kernel_txt, options=['--use_fast_math'])
+
+    func_name = 'sparse_bls_kernel_simple' if use_simple else 'sparse_bls_kernel'
+    kernel = module.get_function(func_name)
+
+    # Don't use prepare(): it causes issues with large shared-memory sizes
+    return kernel
+
+
+def sparse_bls_gpu(t, y, dy, freqs, ignore_negative_delta_sols=False,
+                   block_size=64, max_ndata=None,
+                   stream=None, kernel=None):
+    """
+    GPU-accelerated sparse BLS implementation.
+
+    Uses a CUDA kernel to test all pairs of observations as potential
+    transit boundaries. More efficient than the CPU implementation for
+    datasets with ~100-1000 observations.
+
+    Based on https://arxiv.org/abs/2103.06193
+
+    Parameters
+    ----------
+    t: array_like, float
+        Observation times
+    y: array_like, float
+        Observations
+    dy: array_like, float
+        Observation uncertainties
+    freqs: array_like, float
+        Frequencies to test
+    ignore_negative_delta_sols: bool, optional (default: False)
+        Whether or not to ignore solutions with negative delta (inverted dips)
+    block_size: int, optional (default: 64)
+        CUDA threads per CUDA block (use 32-128 for best performance)
+    max_ndata: int, optional (default: None)
+        Maximum number of data points (for shared memory allocation).
+        If None, uses len(t)
+    stream: pycuda.driver.Stream, optional (default: None)
+        CUDA stream for asynchronous execution
+    kernel: PyCUDA function, optional (default: None)
+        Pre-compiled kernel. If None, the kernel is compiled automatically.
+
+    Returns
+    -------
+    bls_powers: array_like, float
+        BLS power at each frequency
+    solutions: list of (q, phi0) tuples
+        Best (q, phi0) solution at each frequency
+    """
+    # Convert inputs to single-precision numpy arrays
+    t = np.asarray(t).astype(np.float32)
+    y = np.asarray(y).astype(np.float32)
+    dy = np.asarray(dy).astype(np.float32)
+    freqs = np.asarray(freqs).astype(np.float32)
+
+    ndata = len(t)
+    nfreqs = len(freqs)
+
+    if max_ndata is None:
+        max_ndata = ndata
+
+    # Compile the kernel if one was not provided
+    if kernel is None:
+        kernel = compile_sparse_bls(block_size=block_size)
+
+    # Allocate GPU memory and transfer the data
+    t_g = gpuarray.to_gpu(t)
+    y_g = gpuarray.to_gpu(y)
+    dy_g = gpuarray.to_gpu(dy)
+    freqs_g = gpuarray.to_gpu(freqs)
+
+    bls_powers_g = gpuarray.zeros(nfreqs, dtype=np.float32)
+    best_q_g = gpuarray.zeros(nfreqs, dtype=np.float32)
+    best_phi_g = gpuarray.zeros(nfreqs, dtype=np.float32)
+
+    # Shared memory size in bytes: the simple kernel needs three per-datum
+    # arrays (phi, y, w) plus one blockDim-sized temporary array for
+    # reductions. Size the temporary array with block_size to be safe.
+    shared_mem_size = (3 * max_ndata + block_size) * 4
+
+    # Launch configuration: one block per frequency
+    # (or fewer if limited by hardware)
+    max_blocks = 65535  # CUDA maximum grid dimension
+    grid = (min(nfreqs, max_blocks), 1)
+    block = (block_size, 1, 1)
+
+    if stream is None:
+        stream = cuda.Stream()
+
+    # Call the kernel without prepare() to avoid resource issues
+    kernel(
+        t_g, y_g, dy_g, freqs_g,
+        np.uint32(ndata), np.uint32(nfreqs),
+        np.uint32(ignore_negative_delta_sols),
+        bls_powers_g, best_q_g, best_phi_g,
+        block=block, grid=grid, stream=stream,
+        shared=shared_mem_size
+    )
+
+    # Copy results back to the host
+    stream.synchronize()
+    bls_powers = bls_powers_g.get()
+    best_q = best_q_g.get()
+    best_phi = best_phi_g.get()
+
+    solutions = list(zip(best_q, best_phi))
+    return bls_powers, solutions
+
+
 def eebls_transit(t, y, dy, fmax_frac=1.0, fmin_frac=1.0,
                   qmin_fac=0.5, qmax_fac=2.0, fmin=None,
                   fmax=None, freqs=None, qvals=None, use_fast=False,
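
The new sparse_bls_gpu entry point takes the same (t, y, dy, freqs) inputs as sparse_bls_cpu and returns the BLS power plus a (q, phi0) solution per frequency. A minimal usage sketch follows; it is not part of this commit and assumes a CUDA-capable GPU, that a CUDA context is created via pycuda.autoinit, and hypothetical synthetic data:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- assumed way to initialize the CUDA context

from cuvarbase.bls import sparse_bls_gpu

# Hypothetical toy light curve: ~200 irregularly sampled observations
rng = np.random.default_rng(42)
t = np.sort(rng.uniform(0.0, 100.0, 200))
y = 1.0 + 0.01 * rng.standard_normal(t.size)
dy = np.full_like(y, 0.01)

# Frequency grid to search
freqs = np.linspace(0.05, 2.0, 5000)

bls_powers, solutions = sparse_bls_gpu(t, y, dy, freqs)

best = int(np.argmax(bls_powers))
q_best, phi0_best = solutions[best]
print("best freq = %.4f, q = %.4f, phi0 = %.4f"
      % (freqs[best], q_best, phi0_best))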
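
Because sparse_bls_gpu accepts a pre-compiled kernel, repeated calls (for example, over many light curves) can skip recompilation. A hedged sketch, assuming lightcurves is a hypothetical iterable of (t, y, dy) arrays and that the same block_size is passed to compile_sparse_bls and sparse_bls_gpu so the compiled BLOCK_SIZE matches the launch configuration:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- assumed way to initialize the CUDA context

from cuvarbase.bls import compile_sparse_bls, sparse_bls_gpu

freqs = np.linspace(0.05, 2.0, 5000)

block_size = 64
kernel = compile_sparse_bls(block_size=block_size)  # compile once, reuse below

results = []
for t, y, dy in lightcurves:  # hypothetical iterable of light curves
    powers, sols = sparse_bls_gpu(t, y, dy, freqs,
                                  block_size=block_size,
                                  kernel=kernel)
    results.append((powers, sols))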
