@@ -1150,6 +1150,139 @@ def sparse_bls_cpu(t, y, dy, freqs, ignore_negative_delta_sols=False):
11501150 return bls_powers , solutions
11511151
11521152
def compile_sparse_bls(block_size=_default_block_size, use_simple=True, **kwargs):
    """
    Compile the sparse BLS CUDA kernel.

    Parameters
    ----------
    block_size: int, optional (default: _default_block_size)
        Number of CUDA threads per block; baked into the kernel source
        as the ``BLOCK_SIZE`` preprocessor definition.
    use_simple: bool, optional (default: True)
        If True, compile the simplified kernel variant (more reliable,
        slightly slower).
    **kwargs:
        Ignored; accepted for call-signature compatibility.

    Returns
    -------
    kernel: PyCUDA function
        The compiled sparse BLS kernel function.
    """
    # Pick the kernel source file and entry-point name together so the
    # two cannot drift out of sync.
    if use_simple:
        source_name, func_name = 'sparse_bls_simple', 'sparse_bls_kernel_simple'
    else:
        source_name, func_name = 'sparse_bls', 'sparse_bls_kernel'

    kernel_txt = _module_reader(find_kernel(source_name),
                                cpp_defs=dict(BLOCK_SIZE=block_size))

    module = SourceModule(kernel_txt, options=['--use_fast_math'])

    # NOTE: deliberately NOT calling prepare() on the function -- it has
    # caused problems with kernels requesting large shared-memory sizes.
    return module.get_function(func_name)
1184+
def sparse_bls_gpu(t, y, dy, freqs, ignore_negative_delta_sols=False,
                   block_size=64, max_ndata=None,
                   stream=None, kernel=None):
    """
    GPU-accelerated sparse BLS implementation.

    A CUDA kernel tests all pairs of observations as potential transit
    boundaries; this outperforms the CPU implementation for datasets of
    roughly 100-1000 observations.

    Based on https://arxiv.org/abs/2103.06193

    Parameters
    ----------
    t: array_like, float
        Observation times
    y: array_like, float
        Observations
    dy: array_like, float
        Observation uncertainties
    freqs: array_like, float
        Frequencies to test
    ignore_negative_delta_sols: bool, optional (default: False)
        Whether or not to ignore solutions with negative delta
        (inverted dips)
    block_size: int, optional (default: 64)
        CUDA threads per CUDA block (32-128 tends to perform best)
    max_ndata: int, optional (default: None)
        Maximum number of data points, used to size the shared-memory
        allocation. If None, ``len(t)`` is used.
    stream: pycuda.driver.Stream, optional (default: None)
        CUDA stream for async execution
    kernel: PyCUDA function, optional (default: None)
        Pre-compiled kernel; compiled automatically when None.

    Returns
    -------
    bls_powers: array_like, float
        BLS power at each frequency
    solutions: list of (q, phi0) tuples
        Best (q, phi0) solution at each frequency
    """
    # Cast all host inputs to contiguous float32 arrays for the GPU.
    t = np.asarray(t).astype(np.float32)
    y = np.asarray(y).astype(np.float32)
    dy = np.asarray(dy).astype(np.float32)
    freqs = np.asarray(freqs).astype(np.float32)

    ndata, nfreqs = len(t), len(freqs)
    if max_ndata is None:
        max_ndata = ndata

    if kernel is None:
        kernel = compile_sparse_bls(block_size=block_size)
    if stream is None:
        stream = cuda.Stream()

    # Transfer inputs to device memory.
    dev_t = gpuarray.to_gpu(t)
    dev_y = gpuarray.to_gpu(y)
    dev_dy = gpuarray.to_gpu(dy)
    dev_freqs = gpuarray.to_gpu(freqs)

    # Per-frequency outputs, zero-initialized on the device.
    dev_power = gpuarray.zeros(nfreqs, dtype=np.float32)
    dev_q = gpuarray.zeros(nfreqs, dtype=np.float32)
    dev_phi = gpuarray.zeros(nfreqs, dtype=np.float32)

    # Shared memory: three float arrays of length max_ndata (phi, y, w)
    # plus one blockDim-sized scratch array for reductions, 4 bytes each.
    smem_bytes = 4 * (3 * max_ndata + block_size)

    # One block per frequency, capped at 65535 blocks.
    # NOTE(review): if nfreqs exceeds the cap and the kernel does not
    # grid-stride over frequencies, the tail would stay zero -- confirm
    # against the kernel source.
    grid_dims = (min(nfreqs, 65535), 1)
    block_dims = (block_size, 1, 1)

    # Launched without prepare() to sidestep resource issues seen with
    # large shared-memory requests.
    kernel(dev_t, dev_y, dev_dy, dev_freqs,
           np.uint32(ndata), np.uint32(nfreqs),
           np.uint32(ignore_negative_delta_sols),
           dev_power, dev_q, dev_phi,
           block=block_dims, grid=grid_dims, stream=stream,
           shared=smem_bytes)

    # Wait for the launch to finish before reading results back.
    stream.synchronize()
    bls_powers = dev_power.get()
    solutions = list(zip(dev_q.get(), dev_phi.get()))
    return bls_powers, solutions
1285+
11531286def eebls_transit (t , y , dy , fmax_frac = 1.0 , fmin_frac = 1.0 ,
11541287 qmin_fac = 0.5 , qmax_fac = 2.0 , fmin = None ,
11551288 fmax = None , freqs = None , qvals = None , use_fast = False ,
0 commit comments