factor-cuda/factor.cu at master · dghost/factor-cuda · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
/*
 *  CUDA integer factorization using Pollard's Rho (host and device versions)
 */

#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string>
#include <stdint.h>
#include <time.h>

#include <math.h>
#include <cuda.h>

#ifdef _WIN32
#include <windows.h>
#else
#include <unistd.h>
#include <sys/time.h>
#endif

/*
 *  Wall-clock stopwatch.
 *  Every call returns the seconds that have passed since the previous
 *  call and restarts the interval.  The stored timestamp starts at zero,
 *  so the very first call returns the raw clock reading.
 */
static double t0=0;
double Elapsed(void)
{
    double now;
#ifdef _WIN32
    //  Windows: performance counter ticks divided by the counter frequency
    LARGE_INTEGER ticks,rate;
    QueryPerformanceCounter(&ticks);
    QueryPerformanceFrequency(&rate);
    now = ticks.QuadPart/(double)rate.QuadPart;
#else
    //  Unix/Linux/OSX: whole seconds plus microseconds from gettimeofday
    struct timeval stamp;
    gettimeofday(&stamp,NULL);
    now = stamp.tv_sec+1e-6*stamp.tv_usec;
#endif
    //  Report the interval and remember this reading for the next call
    const double delta = now-t0;
    t0 = now;
    return delta;
}

/*
 *  Report a fatal error.
 *  Formats the printf-style message to stderr and then terminates the
 *  process with exit status 1; this function does not return.
 */
void Fatal(const char* format , ...)
{
    va_list ap;
    va_start(ap,format);
    vfprintf(stderr,format,ap);
    va_end(ap);

    exit(1);
}

/*
 *  Select the fastest CUDA device and make it current.
 *
 *  Devices are ranked by multiProcessorCount * clockRate.  When verbose is
 *  non-zero a summary line is printed for every device; the chosen device
 *  is always printed.  Any CUDA failure is reported through Fatal().
 *
 *  Returns the maximum number of threads per block of the chosen device.
 */
int InitGPU(int verbose)
{
    //  Get number of CUDA devices
    int num;
    if (cudaGetDeviceCount(&num) != cudaSuccess) Fatal("Cannot get number of CUDA devices\n");
    if (num<1) Fatal("No CUDA devices found\n");

    //  Rank every device; the score is kept in 64 bits so the product cannot overflow
    cudaDeviceProp prop;
    int       MaxDevice = -1;
    long long MaxScore  = -1;
    for (int dev=0;dev<num;dev++)
    {
        if (cudaGetDeviceProperties(&prop,dev) != cudaSuccess) Fatal("Error getting device %d properties\n",dev);
        long long score = (long long)prop.multiProcessorCount * prop.clockRate;
        if (verbose) printf("CUDA Device %d: %s Gflops %f Processors %d Threads/Block %d\n",dev,prop.name,1e-6*score,prop.multiProcessorCount,prop.maxThreadsPerBlock);
        if (score > MaxScore)
        {
            MaxScore  = score;
            MaxDevice = dev;
        }
    }

    //  Print and set device (prop is re-read because the loop leaves the last device in it)
    if (cudaGetDeviceProperties(&prop,MaxDevice) != cudaSuccess) Fatal("Error getting device %d properties\n",MaxDevice);
    printf("Fastest CUDA Device %d: %s\n",MaxDevice,prop.name);
    if (cudaSetDevice(MaxDevice) != cudaSuccess) Fatal("Cannot select CUDA device %d\n",MaxDevice);

    //  Return max thread count
    return prop.maxThreadsPerBlock;
}

// Host version of the iteration polynomial f(x) = a*x^2 + c.
// No modular reduction is applied here.  The arithmetic is done on
// unsigned 64-bit values so that an oversized product wraps modulo 2^64
// instead of triggering signed-overflow undefined behaviour; results that
// fit in 64 bits are unchanged.
__host__ __forceinline__ int64_t fx(int64_t x, int64_t a, int64_t c) {
    const uint64_t ux = (uint64_t)x;
    return (int64_t)((uint64_t)a * ux * ux + (uint64_t)c);
}

// Host-side binary GCD: works with shifts, comparisons and subtraction
// only.  If either argument is zero the other one is returned as is.
__host__ int64_t gcd(int64_t u, int64_t v)
{
    /* GCD(0,v) == v; GCD(u,0) == u; GCD(0,0) == 0 */
    if (u == 0 || v == 0)
        return (u == 0) ? v : u;

    /* Strip, and count, the factors of two shared by both operands */
    int twos = 0;
    while (((u | v) & 1) == 0) {
        u >>= 1;
        v >>= 1;
        ++twos;
    }

    /* Remaining factors of two in u are not common; after this u stays odd */
    while (!(u & 1))
        u >>= 1;

    for (;;) {
        /* v is non-zero here, so this loop ends with v odd */
        while (!(v & 1))
            v >>= 1;

        /* Both are odd: keep the smaller in u and replace v by the
         (even, non-negative) difference of the two */
        const int64_t lo = (u > v) ? v : u;
        const int64_t hi = (u > v) ? u : v;
        u = lo;
        v = hi - lo;

        if (v == 0)
            break;
    }

    /* Put the shared factors of two back */
    return u << twos;
}

// (a * b) mod m for 0 <= a, b < m, computed by shift-and-add so that no
// intermediate value needs more than 64 bits (m is below 2^63, so every
// sum of two reduced values fits in a uint64_t).
static uint64_t mulmodHost(uint64_t a, uint64_t b, uint64_t m)
{
    uint64_t r = 0;
    while (b != 0)
    {
        if (b & 1)
        {
            r += a;
            if (r >= m) r -= m;
        }
        a += a;
        if (a >= m) a -= m;
        b >>= 1;
    }
    return r;
}

// One step of the iteration: (a*x^2 + c) mod m, all operands already in [0, m).
static uint64_t stepHost(uint64_t x, uint64_t a, uint64_t c, uint64_t m)
{
    uint64_t r = mulmodHost(a, mulmodHost(x, x, m), m) + c;
    return (r >= m) ? r - m : r;
}

// Host version of the Pollard's Rho algorithm.
//
// num is expected to be a positive integer.  Returns 2 or 3 when they
// divide num, the integer square root when num is a perfect square, and
// otherwise a non-trivial factor found by Floyd cycle detection on
// f(x) = (a*x^2 + c) mod num with random a and c.  Returns 0 when no
// thread found a factor (the sequence cycled first); the caller may
// simply call again to retry with new random parameters.
//
// The iteration is evaluated with overflow-free modular arithmetic, so
// values stay in [0, num) for any 63-bit num.
__host__ int64_t pollardHost(int64_t num)
{
    int64_t max = sqrt(num);

    // catch easy cases
    if ( num % 2 == 0)
    {
        return 2;
    } else if ( num % 3 == 0)
    {
        return 3;
    } else if ( max * max == num)
    {
        return max;
    }

    const uint64_t n = (uint64_t)num;
    int64_t result = 0;
    int quit = 0;   // shared stop flag, only touched through atomic read/write

    // if OpenMP is enabled, automatically run this block in parallel
    // using variable 'quit' to synchronize
#pragma omp parallel
    {
        uint64_t a, c;

        // rand() keeps hidden global state, so let one thread draw at a time
#pragma omp critical(pollardHostRand)
        {
            a = (uint64_t)(rand() % (max-1) + 1);
            c = (uint64_t)(rand() % (max-1) + 1);
        }

        uint64_t x = 0;
        uint64_t y = 0;
        int64_t d = 1;
        int stop = 0;

        do
        {
            // x advances one step, y two steps (Floyd cycle detection)
            x = stepHost(x,a,c,n);
            y = stepHost(stepHost(y,a,c,n),a,c,n);
            const uint64_t z = (x > y) ? x - y : y - x;
            d = gcd((int64_t)z,num);
#pragma omp atomic read
            stop = quit;
        } while (d == 1 && !stop );

        // d == num means the sequence cycled without exposing a factor
        if (d != 1 && d != num )
        {
#pragma omp critical(pollardHostResult)
            {
                result = d;
            }
#pragma omp atomic write
            quit = 1;
        }
    }

    return result;
}

// Device version of the iteration polynomial f(x) = a*x^2 + c.
// No modular reduction is applied here.  The arithmetic is done on
// unsigned 64-bit values so that an oversized product wraps modulo 2^64
// instead of triggering signed-overflow undefined behaviour; results that
// fit in 64 bits are unchanged.
__device__ __forceinline__ int64_t fx_d(int64_t x, int64_t a, int64_t c) {
    const uint64_t ux = (uint64_t)x;
    return (int64_t)((uint64_t)a * ux * ux + (uint64_t)c);
}

// Device-side binary GCD: works with shifts, comparisons and subtraction
// only.  If either argument is zero the other one is returned as is.
__device__ int64_t gcd_d(int64_t u, int64_t v)
{
    /* GCD(0,v) == v; GCD(u,0) == u; GCD(0,0) == 0 */
    if (u == 0 || v == 0)
        return (u == 0) ? v : u;

    /* Strip, and count, the factors of two shared by both operands */
    int twos = 0;
    while (((u | v) & 1) == 0) {
        u >>= 1;
        v >>= 1;
        ++twos;
    }

    /* Remaining factors of two in u are not common; after this u stays odd */
    while (!(u & 1))
        u >>= 1;

    for (;;) {
        /* v is non-zero here, so this loop ends with v odd */
        while (!(v & 1))
            v >>= 1;

        /* Both are odd: keep the smaller in u and replace v by the
         (even, non-negative) difference of the two */
        const int64_t lo = (u > v) ? v : u;
        const int64_t hi = (u > v) ? u : v;
        u = lo;
        v = hi - lo;

        if (v == 0)
            break;
    }

    /* Put the shared factors of two back */
    return u << twos;
}


// (a * b) mod m for 0 <= a, b < m, computed by shift-and-add so that no
// intermediate value needs more than 64 bits (m is below 2^63, so every
// sum of two reduced values fits in a uint64_t).
__device__ __forceinline__ uint64_t mulmod_d(uint64_t a, uint64_t b, uint64_t m)
{
    uint64_t r = 0;
    while (b != 0)
    {
        if (b & 1)
        {
            r += a;
            if (r >= m) r -= m;
        }
        a += a;
        if (a >= m) a -= m;
        b >>= 1;
    }
    return r;
}

// One step of the iteration: (a*x^2 + c) mod m, all operands already in [0, m).
__device__ __forceinline__ uint64_t step_d(uint64_t x, uint64_t a, uint64_t c, uint64_t m)
{
    uint64_t r = mulmod_d(a, mulmod_d(x, x, m), m) + c;
    return (r >= m) ? r - m : r;
}

// CUDA kernel for Pollard's Rho.
// Only executes a single pass (one step of x, two steps of y) per launch.
//
// Launch layout: one-dimensional grid and blocks.  There is no bounds
// check, so xd must hold four int64_t values {x, y, a, c} for every
// launched thread, stored consecutively per thread.  The stored values
// are expected to lie in [0, num).
//
// num    : number being factored
// xd     : per-thread state; x and y are updated in place, a and c are read only
// result : single word in device memory; overwritten with a factor d of num
//          when a thread finds one with d != 1 and d != num, left alone otherwise
__global__ void pollardKernel(int64_t num,int64_t* xd, int64_t* result)
{
    // index of this thread's first state word, computed in size_t to avoid int overflow
    const size_t base = 4 * ((size_t)blockIdx.x * blockDim.x + threadIdx.x);
    const uint64_t n = (uint64_t)num;

    // copy state variables into local variables
    uint64_t x = (uint64_t)xd[base];
    uint64_t y = (uint64_t)xd[base + 1];
    const uint64_t a = (uint64_t)xd[base + 2];
    const uint64_t c = (uint64_t)xd[base + 3];

    // execute the pass with overflow-free modular arithmetic
    x = step_d(x,a,c,n);
    y = step_d(step_d(y,a,c,n),a,c,n);
    const uint64_t z = (x > y) ? x - y : y - x;
    const int64_t d = gcd_d((int64_t)z,num);

    // copy updated state back into global memory
    xd[base] = (int64_t)x;
    xd[base + 1] = (int64_t)y;

    // test to see if it found a factor
    if (d != 1 && d != num )
    {
        // publish it through the shared result word; several threads may
        // store here in the same pass, and any of their values is a valid factor
        *result = d;
    }

}

// Wrapper that sets up and calls pollardKernel.
//
// num : number to factor
// Bw  : threads per block
// Bn  : number of blocks
//
// Returns 2 or 3 when they divide num, the integer square root when num is
// a perfect square, and otherwise the first non-zero value the kernel
// stores in the result word.  The kernel is relaunched, one pass per
// launch, until that word becomes non-zero; there is no iteration limit.
// Every CUDA or allocation failure is reported through Fatal().
int64_t pollardDevice(int64_t num, int Bw, int Bn)
{
    if (Bw < 1 || Bn < 1) Fatal("Invalid launch configuration %d x %d\n",Bw,Bn);

    // local variables
    int64_t max = sqrt(num);
    int64_t result = 0;

    // catch easy cases
    if ( num % 2 == 0)
    {
        return 2;

    } else if ( num % 3 == 0)
    {
        return 3;
    } else if ( max * max == num)
    {
        return max;
    }

    // four state words {x, y, a, c} per thread; sizes kept in size_t so
    // large launch configurations cannot overflow an int
    const size_t n = (size_t)4 * Bw * Bn;
    const size_t N = n * sizeof(int64_t);

    // initialize the state array
    int64_t *x = (int64_t *) malloc(N);
    if (!x) Fatal("Could not allocate host memory\n");

    for (size_t i=0 ; i < n ; i += 4)
    {
        // set x, y, a, and c for each thread
        x[i] = 0;
        x[i + 1] = 0;
        x[i + 2] = rand() % (max-1) + 1;
        x[i + 3] = rand() % (max-1) + 1;
    }

    // Allocate device memory
    int64_t* result_d;
    if (cudaMalloc((void**)&result_d,sizeof(int64_t))) Fatal("Cannot allocate device memory result_d\n");
    if (cudaMemcpy(result_d,&result,sizeof(int64_t),cudaMemcpyHostToDevice)) Fatal("Cannot copy result from host to device\n");

    int64_t* Xd;
    if (cudaMalloc((void**)&Xd,N)) Fatal("Cannot allocate device memory Xd\n");
    // blocking copy, so the host staging buffer can be released right away
    if (cudaMemcpy(Xd,x,N,cudaMemcpyHostToDevice)) Fatal("Cannot copy X from host to device\n");
    free(x);

    // run the kernel until it finds a result
    do {
        pollardKernel<<<Bn,Bw>>>(num,Xd,result_d);

        // a rejected launch would leave result at 0 forever, so check every launch
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) Fatal("pollardKernel failed: %s\n",cudaGetErrorString(err));

        // the blocking copy also waits for the kernel and surfaces execution errors
        err = cudaMemcpy(&result,result_d,sizeof(int64_t),cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) Fatal("%s\n",cudaGetErrorString(err));
    } while (result == 0);

    //  Free device memory
    cudaFree(Xd);
    cudaFree(result_d);
    return result;
}

/*
 *  main
 *
 *  Usage: [-v] <block width> <number of blocks> <numbers to check>
 *
 *  Factors every number given on the command line first on the host and
 *  then on the device, prints the factor found by each, the two wall
 *  times, their ratio, and PASS when both factors divide the number.
 *  Overall totals are printed at the end.
 *
 *  NOTE: neither search has an iteration limit; for a prime input no
 *  factor other than 1 or the number itself exists, so the loops below
 *  never finish.
 */
int main(int argc, char* argv[])
{

    //  Process options
    int opt;
    int verbose=0;
    while ((opt=getopt(argc,argv,"v"))!=-1)
    {
        if (opt=='v')
            verbose++;
        else
            Fatal("Usage: [-v] <block width> <number of blocks> <numbers to check>\n");
    }
    argc -= optind;
    argv += optind;


    //  Get width and number of blocks
    if (argc < 3) Fatal("Usage: [-v] <block width> <number of blocks> <numbers to check>\n");
    int Bw = atoi(argv[0]);
    if (Bw<1) Fatal("Block width out of range %d\n",Bw);
    int Bn = atoi(argv[1]);
    if (Bn<1) Fatal("Number of blocks out of range %d\n",Bn);


    //  Total width is block times number of blocks
    int n = Bw*Bn;
    printf("Bw=%d Bn=%d n=%d\n",Bw,Bn,n);

    //  Initialize GPU; blocks are one-dimensional, so Bw itself is the thread count
    int Mw = InitGPU(verbose);
    if (Mw<Bw) Fatal("Thread count %d exceeds threads per block of %d\n",Bw,Mw);

    srand((unsigned)time(NULL));
    double host_total = 0;
    double device_total = 0;

    for (int i = 2; i < argc; i++)
    {
        //  Parse portably as long long instead of per-architecture atol/atoll
        int64_t num = strtoll(argv[i],NULL,10);
        if (num < 2) Fatal("Number to check out of range %s\n",argv[i]);

        //  Find factor on host; a return of 0 means "no factor this attempt", so retry
        Elapsed();
        int64_t res = 0;
        while (res == 0)
            res = pollardHost(num);
        double Th = Elapsed();   // stop the clock before printing
        printf("Host found common factor %lld of %lld\n",(long long)res,(long long)num);

        // test if the host found a valid factor
        bool pass = (num % res == 0);

        //  Find factor on device
        Elapsed();
        res = pollardDevice(num,Bw,Bn);
        double Td = Elapsed();   // stop the clock before printing
        printf("Device found common factor %lld of %lld\n",(long long)res,(long long)num);

        // test if both the host and the device found a valid factor
        pass = pass && (num % res == 0);

        //  Print results
        host_total += Th;
        device_total += Td;
        printf("Host   Time = %6.3f s\n",Th);
        printf("Device Time = %6.3f s\n",Td);
        printf("Speedup     = %.1f\n",Th/Td);
        printf("Result      = %s\n",(pass)?"PASS":"FAIL");
    }

    // print overall statistics
    printf("\nOverall results:\n");
    printf("Host   Time = %6.3f s\n",host_total);
    printf("Device Time = %6.3f s\n",device_total);
    printf("Speedup     = %.1f\n",host_total/device_total);
    printf("\n");

    //  Done
    return 0;
}