/* Copyright (C) 2013 David G. Andersen. All rights reserved.
*
* Use of this code is covered under the Apache 2.0 license, which
* can be found in the file "LICENSE"
*/
#include <sys/time.h>
#include "hasher.h"
#include "scrypt_cores.cu"
/* write_keys writes the 8 keys being processed by a warp to the global
* scratchpad. To effectively use memory bandwidth, it performs the writes
* (and reads, for read_keys) 128 bytes at a time per memory location
* by __shfl'ing the 4 entries in bx to the threads in the next-up
* thread group. It then has eight threads together perform uint4
* (128 bit) writes to the destination region. This seems to make
* quite effective use of memory bandwidth. An approach that spread
* uint32s across more threads was slower because of the increased
* computation it required.
*
* "start" is the loop iteration producing the write - the offset within
* the block's memory.
*
* Internally, this algorithm first __shfl's the 4 bx entries to
* the next up thread group, and then uses a conditional move to
* ensure that odd-numbered thread groups exchange the b/bx ordering
* so that the right parts are written together.
*
* Thanks to Babu for helping design the 128-bit-per-write version.
*
* _direct lets the caller specify the absolute start location instead of
* the relative start location, as an attempt to reduce some recomputation.
*/
__device__
inline void write_keys_direct(const uint32_t b[4], const uint32_t bx[4], uint32_t *scratch, uint32_t start) {
uint4 t, t2;
t.x = b[0]; t.y = b[1]; t.z = b[2]; t.w = b[3];
int target_thread = (threadIdx.x + 4)%32;
t2.x = __shfl((int)bx[0], target_thread);
t2.y = __shfl((int)bx[1], target_thread);
t2.z = __shfl((int)bx[2], target_thread);
t2.w = __shfl((int)bx[3], target_thread);
int t2_start = __shfl((int)start, target_thread) + 4;
// Both branches store t at start and t2 at t2_start; the conditional on
// bit 2 of the lane only swaps the issue order, so that each warp-wide
// store instruction covers one contiguous 128-byte region.
bool c = (threadIdx.x & 0x4);
int loc = c ? t2_start : start;
*((uint4 *)(&scratch[loc])) = (c ? t2 : t);
loc = c ? start : t2_start;
*((uint4 *)(&scratch[loc])) = (c ? t : t2);
}
__device__
inline void write_keys(const uint32_t b[4], const uint32_t bx[4], uint32_t *scratch, uint32_t start) {
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_SCRYPT_BLOCK;
start = scrypt_block*SCRYPT_SCRATCH_PER_BLOCK + (32*start) + 8*(threadIdx.x%4);
write_keys_direct(b, bx, scratch, start);
}
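/* Resulting scratch layout (worked out from the code above): for scrypt
 * block s, iteration i, and thread t (0..3) within the block,
 * scratch[s*SCRYPT_SCRATCH_PER_BLOCK + 32*i + 8*t] holds the 8 words
 * { b[0..3], bx[0..3] } of thread t; the bx half is written on each
 * lane's behalf by the warp lane four below it (mod 32). */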
inline __device__ void read_xor_keys_direct(uint32_t b[4], uint32_t bx[4], const uint32_t * __restrict__ scratch, uint32_t start) {
uint4 t, t2;
// Tricky bit: We do the work on behalf of thread+4, but then when
// we steal, we have to steal from (thread+28)%32 to get the right
// stuff back.
// Also note the broadcast: each group of 4 threads takes the slot
// computed by its leader (lane & ~3), since only that lane's bx[0]
// is the X[16] of the original algorithm.
start = __shfl((int)start, (threadIdx.x & 0x7c)) + 8*(threadIdx.x%4);
int target_thread = (threadIdx.x + 4)%32;
int t2_start = __shfl((int)start, target_thread) + 4;
bool c = (threadIdx.x & 0x4);
int loc = c ? t2_start : start;
t = *((uint4 *)(&scratch[loc]));
loc = c ? start : t2_start;
t2 = *((uint4 *)(&scratch[loc]));
uint4 tmp = t; t = (c ? t2 : t); t2 = (c ? tmp : t2);
b[0] ^= t.x; b[1] ^= t.y; b[2] ^= t.z; b[3] ^= t.w;
int steal_target = (threadIdx.x + 28)%32;
bx[0] ^= __shfl((int)t2.x, steal_target);
bx[1] ^= __shfl((int)t2.y, steal_target);
bx[2] ^= __shfl((int)t2.z, steal_target);
bx[3] ^= __shfl((int)t2.w, steal_target);
}
inline __device__ void read_xor_keys(uint32_t b[4], uint32_t bx[4], const uint32_t * __restrict__ scratch, uint32_t start) {
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_SCRYPT_BLOCK;
start = scrypt_block*SCRYPT_SCRATCH_PER_BLOCK + (32*start);
read_xor_keys_direct(b, bx, scratch, start);
}
inline __device__ void primary_order_shuffle(uint32_t b[4], uint32_t bx[4]) {
/* Inner loop shuffle targets */
int x1_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+1)&0x3);
int x2_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+2)&0x3);
int x3_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+3)&0x3);
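/* Worked example: within lane group {4,5,6,7} the targets rotate inside
 * the group, so lane 4 pulls b[3] from lane 5 (x1), b[2] from lane 6 (x2),
 * and b[1] from lane 7 (x3); lane 7 wraps around to lanes 4, 5, and 6. */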
b[3] = __shfl((int)b[3], x1_target_lane);
b[2] = __shfl((int)b[2], x2_target_lane);
b[1] = __shfl((int)b[1], x3_target_lane);
uint32_t tmp = b[1]; b[1] = b[3]; b[3] = tmp;
bx[3] = __shfl((int)bx[3], x1_target_lane);
bx[2] = __shfl((int)bx[2], x2_target_lane);
bx[1] = __shfl((int)bx[1], x3_target_lane);
tmp = bx[1]; bx[1] = bx[3]; bx[3] = tmp;
}
/*
* load_key loads a 32*32bit key from a contiguous region of memory in B.
* The input keys are in external order (i.e., 0, 1, 2, 3, ...).
* After loading, each thread has its four b and four bx keys stored
* in internal processing order.
*/
inline __device__ void load_key(const uint32_t *B, uint32_t b[4], uint32_t bx[4]) {
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_SCRYPT_BLOCK;
int key_offset = scrypt_block * 32;
uint32_t thread_in_block = threadIdx.x % 4;
// Read in permuted order. Key loads are not our bottleneck right now.
for (int i = 0; i < 4; i++) {
b[i] = B[key_offset + 4*thread_in_block + (thread_in_block+i)%4];
bx[i] = B[key_offset + 4*thread_in_block + (thread_in_block+i)%4 + 16];
}
primary_order_shuffle(b, bx);
}
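/* Worked example of the internal order (matching the permutation listed
 * in the hasher_gen_kernel comment below): after load_key, thread 0 of a
 * block holds b = { B[0], B[4], B[8], B[12] }, thread 1 holds
 * { B[5], B[9], B[13], B[1] }, thread 2 holds { B[10], B[14], B[2], B[6] },
 * and thread 3 holds { B[15], B[3], B[7], B[11] }; likewise for bx at
 * offsets +16. */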
/*
* store_key performs the opposite transform as load_key, taking
* internally-ordered b and bx and storing them into a contiguous
* region of B in external order.
*/
inline __device__ void store_key(uint32_t *B, uint32_t b[4], uint32_t bx[4]) {
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_SCRYPT_BLOCK;
int key_offset = scrypt_block * 32;
uint32_t thread_in_block = threadIdx.x % 4;
primary_order_shuffle(b, bx);
for (int i = 0; i < 4; i++) {
B[key_offset + 4*thread_in_block + (thread_in_block+i)%4] = b[i];
B[key_offset + 4*thread_in_block + (thread_in_block+i)%4 + 16] = bx[i];
}
}
/*
* salsa_xor_core does the equivalent of the xor_salsa8 loop from
* tarsnap's implementation of scrypt. The original scrypt called:
* xor_salsa8(&X[0], &X[16]); <-- the "b" loop
* xor_salsa8(&X[16], &X[0]); <-- the "bx" loop
* This version is unrolled to handle both of these loops in a single
* call to avoid unnecessary data movement. (For comparison, a scalar
* reference sketch of xor_salsa8 follows the function below.)
*/
inline __device__ void salsa_xor_core(uint32_t b[4], uint32_t bx[4], uint32_t x[4],
const int x1_target_lane,
const int x2_target_lane,
const int x3_target_lane) {
uint32_t tmp;
#pragma unroll
for (int i = 0; i < 4; i++) {
b[i] ^= bx[i];
x[i] = b[i];
}
#define XOR_ROTATE_ADD(dst, s1, s2, amt) do { tmp = x[s1]+x[s2]; x[dst] ^= ((tmp<<amt)|(tmp>>(32-amt))); } while(0)
// Enter in "column" mode (t0 has 0, 4, 8, 12)
for (int j = 0; j < 4; j++) {
// Mixing phase of salsa
XOR_ROTATE_ADD(1, 0, 3, 7);
XOR_ROTATE_ADD(2, 1, 0, 9);
XOR_ROTATE_ADD(3, 2, 1, 13);
XOR_ROTATE_ADD(0, 3, 2, 18);
/* Transpose rows and columns. */
/* Unclear if this optimization is needed: These are ordered based
* upon the dependencies needed in the later xors. Compiler should be
* able to figure this out, but might as well give it a hand. */
x[1] = __shfl((int)x[1], x3_target_lane);
x[3] = __shfl((int)x[3], x1_target_lane);
x[2] = __shfl((int)x[2], x2_target_lane);
/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
* but the register targets are rewritten here to swap x[1] and x[3] so that
* they can be directly shuffled to and from our peer threads without
* reassignment. The reverse shuffle then puts them back in the right place.
*/
XOR_ROTATE_ADD(3, 0, 1, 7);
XOR_ROTATE_ADD(2, 3, 0, 9);
XOR_ROTATE_ADD(1, 2, 3, 13);
XOR_ROTATE_ADD(0, 1, 2, 18);
x[3] = __shfl((int)x[3], x3_target_lane);
x[1] = __shfl((int)x[1], x1_target_lane);
x[2] = __shfl((int)x[2], x2_target_lane);
}
for (int i = 0; i < 4; i++) {
b[i] += x[i];
// The next two lines are the beginning of the BX-centric loop iteration
bx[i] ^= b[i];
x[i] = bx[i];
}
// This is a copy of the same loop above, identical but stripped of comments.
// Duplicated so that we can complete a bx-based loop with fewer register moves.
for (int j = 0; j < 4; j++) {
XOR_ROTATE_ADD(1, 0, 3, 7);
XOR_ROTATE_ADD(2, 1, 0, 9);
XOR_ROTATE_ADD(3, 2, 1, 13);
XOR_ROTATE_ADD(0, 3, 2, 18);
x[1] = __shfl((int)x[1], x3_target_lane);
x[3] = __shfl((int)x[3], x1_target_lane);
x[2] = __shfl((int)x[2], x2_target_lane);
XOR_ROTATE_ADD(3, 0, 1, 7);
XOR_ROTATE_ADD(2, 3, 0, 9);
XOR_ROTATE_ADD(1, 2, 3, 13);
XOR_ROTATE_ADD(0, 1, 2, 18);
x[3] = __shfl((int)x[3], x3_target_lane);
x[1] = __shfl((int)x[1], x1_target_lane);
x[2] = __shfl((int)x[2], x2_target_lane);
}
// At the end of these iterations, the data is in primary order again.
#undef XOR_ROTATE_ADD
for (int i = 0; i < 4; i++) {
bx[i] += x[i];
}
}
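/*
 * For comparison, a scalar reference version of one xor_salsa8 call,
 * written in the style of the tarsnap/cpuminer implementations the
 * comment above refers to. xor_salsa8_reference is illustrative only and
 * is not called anywhere in this file; salsa_xor_core performs the same
 * mixing twice (b then bx) with the 16 words spread across 4 threads.
 */
static void xor_salsa8_reference(uint32_t B[16], const uint32_t Bx[16]) {
    uint32_t x[16];
    for (int i = 0; i < 16; i++) x[i] = (B[i] ^= Bx[i]);
#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
    for (int i = 0; i < 8; i += 2) {
        /* Operate on columns. */
        x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 9] ^= R(x[ 5]+x[ 1], 7);
        x[14] ^= R(x[10]+x[ 6], 7);  x[ 3] ^= R(x[15]+x[11], 7);
        x[ 8] ^= R(x[ 4]+x[ 0], 9);  x[13] ^= R(x[ 9]+x[ 5], 9);
        x[ 2] ^= R(x[14]+x[10], 9);  x[ 7] ^= R(x[ 3]+x[15], 9);
        x[12] ^= R(x[ 8]+x[ 4],13);  x[ 1] ^= R(x[13]+x[ 9],13);
        x[ 6] ^= R(x[ 2]+x[14],13);  x[11] ^= R(x[ 7]+x[ 3],13);
        x[ 0] ^= R(x[12]+x[ 8],18);  x[ 5] ^= R(x[ 1]+x[13],18);
        x[10] ^= R(x[ 6]+x[ 2],18);  x[15] ^= R(x[11]+x[ 7],18);
        /* Operate on rows. */
        x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 6] ^= R(x[ 5]+x[ 4], 7);
        x[11] ^= R(x[10]+x[ 9], 7);  x[12] ^= R(x[15]+x[14], 7);
        x[ 2] ^= R(x[ 1]+x[ 0], 9);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
        x[ 8] ^= R(x[11]+x[10], 9);  x[13] ^= R(x[12]+x[15], 9);
        x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 4] ^= R(x[ 7]+x[ 6],13);
        x[ 9] ^= R(x[ 8]+x[11],13);  x[14] ^= R(x[13]+x[12],13);
        x[ 0] ^= R(x[ 3]+x[ 2],18);  x[ 5] ^= R(x[ 4]+x[ 7],18);
        x[10] ^= R(x[ 9]+x[ 8],18);  x[15] ^= R(x[14]+x[13],18);
    }
#undef R
    for (int i = 0; i < 16; i++) B[i] += x[i];
}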
/*
* The hasher_gen_kernel operates on a group of 1024-bit input keys
* in B, stored as:
* B = { k1B k1Bx k2B k2Bx ... }
* and fills up the scratchpad with the iterative hashes derived from
* those keys:
scratch = { k1h1B k1h1Bx k1h2B k1h2Bx ... k2h1B k2h1Bx k2h2B k2h2Bx ... }
* scratch is 1024 times larger than the input keys B.
* It is extremely important to stream writes effectively into scratch;
* less important to coalesce the reads from B.
*
* Key ordering note: Keys are input from B in "original" order:
* K = { k0, k1, ..., k15, kx0, kx1, ..., kx15 }
* Inside hasher_gen_kernel, each key's k and kx components are permuted
* into an internal order to make processing faster:
* K = k, kx with:
* k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11
* and similarly for kx.
*/
__global__
void hasher_gen_kernel(uint32_t * __restrict__ B, uint32_t * __restrict__ scratch) {
/* Each thread operates on four of the sixteen B and Bx variables. Thus,
* each key is processed by four threads in parallel. salsa_scrypt_core
* internally shuffles the variables between threads (and back) as
* needed.
*/
uint32_t b[4], bx[4], x[4];
load_key(B, b, bx);
/* Inner loop shuffle targets */
int x1_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+1)&0x3);
int x2_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+2)&0x3);
int x3_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+3)&0x3);
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_SCRYPT_BLOCK;
int start = scrypt_block*SCRYPT_SCRATCH_PER_BLOCK + 8*(threadIdx.x%4);
for (int i = 0; i < 1024; i++) {
write_keys_direct(b, bx, scratch, start+32*i);
salsa_xor_core(b, bx, x, x1_target_lane, x2_target_lane, x3_target_lane);
}
store_key(B, b, bx);
}
/*
* hasher_hash_kernel runs the second phase of scrypt after the scratch
* buffer is filled with the iterative hashes: It bounces through
* the scratch buffer in pseudorandom order, mixing the key as it goes.
*/
__global__
void hasher_hash_kernel(uint32_t * __restrict__ B, const uint32_t * __restrict__ scratch) {
/* Each thread operates on a group of four variables that must be processed
 * together, shuffling between threads in a warp between iterations.
 */
uint32_t b[4], bx[4], x[4];
load_key(B, b, bx);
/* Inner loop shuffle targets */
int x1_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+1)&0x3);
int x2_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+2)&0x3);
int x3_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+3)&0x3);
for (int i = 0; i < 1024; i++) {
// Bounce through the key space and XOR the new keys in.
// Critical thing: (X[16] & 1023) tells us the next slot to read.
// X[16] in the original is bx[0] of the first thread in each 4-thread
// group; read_xor_keys broadcasts that lane's slot to the others.
int slot = bx[0] & 1023;
read_xor_keys(b, bx, scratch, slot);
salsa_xor_core(b, bx, x, x1_target_lane, x2_target_lane, x3_target_lane);
}
store_key(B, b, bx);
}
/*
* hasher_combo_kernel runs the functions of both hasher_gen_kernel
* and hasher_hash_kernel in a single invocation. It is
* designed to reduce kernel launch downtime a bit and omit one
* intermediate store_key operation to global memory. This is faster on
* my GT 550m, but seems a bit slower on a Tesla, probably because one of
* the two individual kernels can use fewer registers alone.
*/
__global__
void hasher_combo_kernel(uint32_t * __restrict__ B, uint32_t * __restrict__ scratch) {
uint32_t b[4], bx[4], x[4];
load_key(B, b, bx);
int x1_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+1)&0x3);
int x2_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+2)&0x3);
int x3_target_lane = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+3)&0x3);
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_SCRYPT_BLOCK;
int start = scrypt_block*SCRYPT_SCRATCH_PER_BLOCK + 8*(threadIdx.x%4);
for (int i = 0; i < 1024; i++) {
write_keys_direct(b, bx, scratch, start);
start += 32;
salsa_xor_core(b, bx, x, x1_target_lane, x2_target_lane, x3_target_lane);
}
start = scrypt_block*SCRYPT_SCRATCH_PER_BLOCK;
for (int i = 0; i < 1024; i++) {
int slot = bx[0] & 1023;
read_xor_keys_direct(b, bx, scratch, start+32*slot);
salsa_xor_core(b, bx, x, x1_target_lane, x2_target_lane, x3_target_lane);
}
store_key(B, b, bx);
}
/*
* scrypt_hash_start_kernel takes a job description (job) and a starting nonce number (n_base)
* and generates (batchsize) starting hashes using HMAC_SHA256 and PBKDF2_SHA256.
* This is the first step in scrypt computation before the salsa core executions
* that introduce the memory difficulty.
*
* This function stores three outputs:
* output - the 1024-bit intermediate state fed to scrypt_core
* tstate, ostate - intermediate PBKDF2 state that needs to be used again
* after the execution of scrypt_core.
*/
__global__
void scrypt_hash_start_kernel(const scan_job * __restrict__ job, uint32_t * __restrict__ output, uint32_t * __restrict__ ostate_out, uint32_t * __restrict__ tstate_out, uint32_t n_base) {
uint32_t tstate[8];
uint32_t ostate[8];
uint32_t data[20];
int blockid = (blockIdx.x*blockDim.x + threadIdx.x);
/* data: the input.
* tstate, ostate, output must have sufficient space
* -> batchsize * 8 * sizeof(uint32_t)
*/
/* Trivial implementation: Each thread processes one key. This is lame, but
* PBKDF related processing is only about 3.5% of runtime right now. */
const uint32_t *in_data = job->data;
uint64_t blockstart = (uint64_t)blockid * 32;  /* widen before multiply to avoid int overflow */
{
uint4 d;
for (int i = 0; i < 20; i+= 4) {
d = *(uint4 *)&in_data[i];
data[i] = d.x; data[i+1] = d.y; data[i+2] = d.z; data[i+3] = d.w;
}
}
data[19] = n_base + blockid;
read_8_as_uint4(job->initial_midstate, tstate);
dev_HMAC_SHA256_80_init(data, tstate, ostate);
/* This writes directly to output and does no shuffling or cleverness
* to coalesce writes. Unnecessary memory transactions, but not worth
* fixing yet - PBKDF is less than 4% of runtime overall still. */
dev_PBKDF2_SHA256_80_128(tstate, ostate, data, &output[blockstart]);
/* Write out (and read back) tstate and ostate interleaved across
threads to improve mem b/w. Easy change, though small benefit. */
int bigblockstart = (blockIdx.x*blockDim.x)*8;
int threadsInBlock = blockDim.x;
for (int i = 0; i < 8; i++) {
tstate_out[bigblockstart + threadsInBlock*i + threadIdx.x ] = tstate[i];
ostate_out[bigblockstart + threadsInBlock*i + threadIdx.x ] = ostate[i];
}
}
/*
* scrypt_hash_finish_kernel takes the output state from scrypt_core and
* recombines it with the saved PBKDF2 tstate and ostate from
* scrypt_hash_start_kernel to produce an output key.
*
* It then compares the output key ("hash") to the target number
* specified in the job to determine whether hash < job->target.
* If it is, it puts its block/thread ID + 1 into *dev_output.
* If it is not, it does nothing. The caller should ensure that
* dev_output is zero before the call to scrypt_hash_finish_kernel, and
* should subtract one from non-zero output to determine the
* actual thread ID (and thus, the nonce used for hashing).
*
* This method does not guarantee that the lowest-numbered or
* smallest acceptable output is returned, merely that one satisfying
* output is returned if any exist.
*/
__global__ void scrypt_hash_finish_kernel(const uint32_t * __restrict__ dev_keys, uint32_t * __restrict__ dev_tstate, uint32_t * __restrict__ dev_ostate, uint32_t * __restrict__ dev_output, const scan_job * __restrict__ job) {
uint32_t tstate[8];
uint32_t ostate[8];
uint32_t hash[32];
int blockid = (blockIdx.x*blockDim.x + threadIdx.x);
int bigblockstart = blockIdx.x*blockDim.x*8;
int threadsInBlock = blockDim.x;
/* As in start_kernel, reads tstate/ostate interleaved between threads */
for (int i = 0; i < 8; i++) {
tstate[i] = dev_tstate[bigblockstart + threadsInBlock*i + threadIdx.x ];
ostate[i] = dev_ostate[bigblockstart + threadsInBlock*i + threadIdx.x ];
}
uint64_t blockstart = (uint64_t)blockid * 32;  /* widen before multiply to avoid int overflow */
uint4 t;
for (int i = 0; i < 32; i+= 4) {
t = *(uint4 *)&dev_keys[blockstart+i];
hash[i] = t.x; hash[i+1] = t.y; hash[i+2] = t.z; hash[i+3] = t.w;
}
dev_PBKDF2_SHA256_128_32(tstate, ostate, hash);
uint32_t foundit = 0x00000000;
uint32_t maybe = 0xffffffff;
uint32_t target[8];
read_8_as_uint4(job->target, target);
// Branch-free lexicographic compare against the target, most significant
// word (j == 7) first. "maybe" stays set while every word seen so far is
// <= the target's; "foundit" latches at the first strictly smaller word,
// which makes the whole hash smaller regardless of the remaining words.
for (int j = 7; j >= 0; j--) {
uint32_t tmp = swab32(ostate[j]);
maybe = (maybe & (tmp <= target[j]));
foundit = (foundit | (maybe & (tmp < target[j])));
}
foundit = foundit ? (blockid+1) : foundit;
if (foundit) {
// Finding the lowest doesn't matter. Just let the first writer win.
atomicCAS(&dev_output[0], 0, foundit);
}
}
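/*
 * For intuition, a branchy host-side equivalent of the comparison loop
 * above. hash_below_target is a hypothetical helper, not used by the
 * kernels: it returns nonzero iff hash < target when both are read as
 * 256-bit numbers with the most significant 32-bit word at index 7.
 */
static int hash_below_target(const uint32_t hash[8], const uint32_t target[8]) {
    for (int j = 7; j >= 0; j--) {
        if (hash[j] < target[j]) return 1;  /* strictly below at the first differing word */
        if (hash[j] > target[j]) return 0;
    }
    return 0;  /* equal is not below */
}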
/* Unit test kernel to expose loading, writing, reading, and storing keys */
__global__
void test_load_store_kernel(uint32_t * __restrict__ B, uint32_t * __restrict__ scratch) {
uint32_t b[4], bx[4];
load_key(B, b, bx);
for (int i = 0; i < 4; i++) { b[i]++; bx[i]++; }
for (int slot = 0; slot < 1024; slot++) {
write_keys(b, bx, scratch, slot);
for (int i = 0; i < 4; i++) { b[i] = bx[i] = 0; }
read_xor_keys(b, bx, scratch, slot);
}
store_key(B, b, bx);
}
/*
 * The CudaHasher constructor only nulls the device pointers.
 * You must call Initialize() and check its error code before use.
 */
CudaHasher::CudaHasher() :
dev_keys(NULL), dev_scratch(NULL), dev_output(NULL),
dev_tstate(NULL), dev_ostate(NULL), scan_output(NULL),
dev_job(NULL) {
}
/*
 * Initialize() sets up the GPU state.
 * Between creation and destruction, a lot of GPU memory
 * may be consumed. For performance, it is _not_ freed
 * while ComputeHashes is not running. Ergo: if you're going
 * to be idle for a long time and want to be nice to other
 * users of the GPU, destroy the CudaHasher and create a new one.
 *
 * If Initialize fails, destroy the object to reset the GPU.
 */
int CudaHasher::Initialize() {
cudaError_t error;
/* Stop eating CPU while waiting for results! */
error = cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
if (error != cudaSuccess) {
fprintf(stderr, "Could not set blocking sync (error %d)\n", error);
}
/* Determine device memory to size batch */
size_t free, total;
cudaMemGetInfo(&free, &total);
//printf("Initializing. Device has %ld free of %ld total bytes of memory\n", free, total);
int mem_per_job = 160000; /* A little conservative */
int max_batchsize = free / mem_per_job;
/* We need 4 threads per work unit, and each CUDA block should have
 * 192 (THREADS_PER_CUDA_BLOCK) threads; round the batch size down to a
 * multiple of 2*THREADS_PER_CUDA_BLOCK so the grid divides evenly.
 * The number of blocks should also probably be a multiple of the
 * number of multiprocessors. This is a simple way to get a pretty-OK
 * answer that's probably not optimal for big GPUs. */
batchsize = (max_batchsize/(2*THREADS_PER_CUDA_BLOCK))*2*THREADS_PER_CUDA_BLOCK;
n_blocks = (batchsize*THREADS_PER_SCRYPT_BLOCK/THREADS_PER_CUDA_BLOCK);
error = cudaMalloc((void **) &dev_job, sizeof(scan_job));
if (error != cudaSuccess) {
fprintf(stderr, "Could not allocate CUDA array, error code %d, line(%d)\n", error, __LINE__);
dev_job = NULL;
return -1;
}
error = cudaMalloc((void **) &dev_keys, sizeof(scrypt_hash) * batchsize);
if (error != cudaSuccess) {
fprintf(stderr, "Could not allocate CUDA array, error code %d, line(%d)\n", error, __LINE__);
dev_keys = NULL;
return -1;
}
size_t scratchBufSize = sizeof(scrypt_hash)*1024*batchsize;
error = cudaMalloc((void **) &dev_scratch, scratchBufSize);
if (error != cudaSuccess) {
fprintf(stderr, "Could not allocate CUDA array, error code %d, line(%d)\n", error, __LINE__);
dev_scratch = NULL;
return -1;
}
/* dev_output holds a single uint32_t: zero, or (ID+1) of a thread that solved the block */
error = cudaMalloc((void **) &dev_output, sizeof(uint32_t));
if (error != cudaSuccess) {
fprintf(stderr, "Could not allocate CUDA array, error code %d, line(%d)\n", error, __LINE__);
dev_output = NULL;
return -1;
}
error = cudaMalloc((void **) &dev_tstate, 8*sizeof(uint32_t) * batchsize);
if (error != cudaSuccess) {
fprintf(stderr, "Could not allocate CUDA array, error code %d, line(%d)\n", error, __LINE__);
dev_tstate = NULL;
return -1;
}
error = cudaMalloc((void **) &dev_ostate, 8 * sizeof(uint32_t) * batchsize);
if (error != cudaSuccess) {
fprintf(stderr, "Could not allocate CUDA array, error code %d, line(%d)\n", error, __LINE__);
dev_ostate = NULL;
return -1;
}
scan_output = (uint32_t *)malloc(sizeof(uint32_t));
if (!scan_output) {
perror("Could not allocate scan output buffer (host)");
return -1;
}
cudaFuncSetCacheConfig(hasher_hash_kernel, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(hasher_gen_kernel, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(hasher_combo_kernel, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(scrypt_hash_start_kernel, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(scrypt_hash_finish_kernel, cudaFuncCachePreferL1);
return 0;
}
CudaHasher::~CudaHasher() {
/* Free host memory */
if (scan_output) free(scan_output);
/* Free device memory */
if (dev_scratch != NULL) cudaFree(dev_scratch);
if (dev_keys != NULL) cudaFree(dev_keys);
if (dev_output != NULL) cudaFree(dev_output);
if (dev_tstate != NULL) cudaFree(dev_tstate);
if (dev_ostate != NULL) cudaFree(dev_ostate);
if (dev_job != NULL) cudaFree(dev_job);
cudaDeviceReset();
}
static void init_test_keys(scrypt_hash *keys_in, int n) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < SCRYPT_WIDTH; j++) {
keys_in[i].b[j] = SCRYPT_WIDTH*2*i + j;
}
for (int j = SCRYPT_WIDTH; j < 2*SCRYPT_WIDTH; j++) {
keys_in[i].bx[j-SCRYPT_WIDTH] = SCRYPT_WIDTH*2*i + j;
}
}
}
int verify_test_keys(scrypt_hash *keys_in, int n, std::string testname, int extra) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < SCRYPT_WIDTH; j++) {
if (keys_in[i].b[j] != (SCRYPT_WIDTH*2*i + j + extra)) {
fprintf(stderr, "Failed %s validation of keys_in[%d].b[%d]\n", testname.c_str(), i, j);
fprintf(stderr, "Got %d expected %d\n", keys_in[i].b[j], SCRYPT_WIDTH*2*i+j+extra);
return -1;
}
}
for (int j = SCRYPT_WIDTH; j < 2*SCRYPT_WIDTH; j++) {
if (keys_in[i].bx[j-SCRYPT_WIDTH] != (SCRYPT_WIDTH*2*i + j + extra)) {
fprintf(stderr, "Failed %s validation of keys_in[%d].bx[%d]\n", testname.c_str(), i, j-SCRYPT_WIDTH);
return -1;
}
}
}
return 0;
}
int CudaHasher::TestLoadStore() {
cudaError error;
int success = -1;
scrypt_hash *keys_in = (scrypt_hash *)malloc(batchsize * sizeof(struct scrypt_hash));
if (!keys_in) {
perror("TestLoadStore: Could not allocate host keys");
goto failed;
}
init_test_keys(keys_in, batchsize);
if (verify_test_keys(keys_in, batchsize, "pre-kernel", 0) == -1) {
goto failed;
}
error = cudaMemcpy(dev_keys, keys_in, sizeof(scrypt_hash) * batchsize, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "TestLoadStore: Could not memcpy to device, error %d\n", error);
goto failed;
}
test_load_store_kernel<<<n_blocks, THREADS_PER_CUDA_BLOCK>>>(dev_keys, dev_scratch);
error = cudaMemcpy(keys_in, dev_keys, batchsize * sizeof(scrypt_hash), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
fprintf(stderr, "TestLoadStore: Could not memcpy from device, error %d\n", error);
goto failed;
}
if (verify_test_keys(keys_in, batchsize, "post-kernel", 1) == -1) { goto failed; }
fprintf(stderr, "TestLoadStore passed\n");
success = 0;
failed:
free(keys_in);
return success;
}
/*
 * The simplest workhorse of our computation: takes a set of n_hashes
 * input keys and computes their hashes. Note that only n_hashes results
 * are copied in and out; if n_hashes is less than the batchsize, the
 * remaining slots still run (on whatever was left in device memory) to
 * keep the batch full. Version 1, remember?
 *
 * Returns when complete. (A usage sketch follows the function.)
 *
 * Only one thread should call ComputeHashes at a time on any given device.
 */
int CudaHasher::ComputeHashes(const scrypt_hash *in, scrypt_hash *out, int n_hashes)
{
cudaError error = cudaMemcpy(dev_keys, in, sizeof(scrypt_hash) * n_hashes, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Could not memcpy to device, error code %d, line(%d)\n", error, __LINE__);
return -1;
}
// hasher_gen_kernel<<<n_blocks, THREADS_PER_CUDA_BLOCK>>>(dev_keys, dev_scratch);
// hasher_hash_kernel<<<n_blocks, THREADS_PER_CUDA_BLOCK>>>(dev_keys, dev_scratch);
hasher_combo_kernel<<<n_blocks, THREADS_PER_CUDA_BLOCK>>>(dev_keys, dev_scratch);
cudaDeviceSynchronize();
error = cudaMemcpy(out, dev_keys, n_hashes * sizeof(scrypt_hash), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
fprintf(stderr, "Could not memcpy from device, error code %d, line(%d)\n", error, __LINE__);
return -1;
}
return 0;
}
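/*
 * A minimal usage sketch of the interface above (hypothetical driver
 * code; hash_batch is illustrative and not part of this file):
 */
#if 0
int hash_batch(const scrypt_hash *in, scrypt_hash *out, int n) {
    CudaHasher hasher;
    if (hasher.Initialize() != 0)   /* must check before any use */
        return -1;
    return hasher.ComputeHashes(in, out, n);
    /* ~CudaHasher frees device memory and resets the device */
}
#endif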
int CudaHasher::ScanNCoins(uint32_t *pdata, const uint32_t *ptarget, int n_hashes, volatile int *stop, unsigned long *hashes_done)
{
int n_done = 0;
uint32_t data[20];
uint32_t n = pdata[19]; /* sigh, cpuminer */
memcpy(data, pdata, sizeof(data));
scan_job j;
memcpy(j.data, data, sizeof(data));
memcpy(j.target, ptarget, sizeof(uint32_t)*8);
/* Set up the job midstate once on the CPU */
sha256_init(j.initial_midstate);
sha256_transform(j.initial_midstate, data, 0);
*scan_output = 0;
cudaError error = cudaMemcpy(dev_job, &j, sizeof(scan_job), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Could not memcpy job to device, error code %d, line(%d)\n", error, __LINE__);
exit(-1);
}
error = cudaMemcpy(dev_output, scan_output, sizeof(uint32_t), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Could not memcpy job to device, error code %d, line(%d)\n", error, __LINE__);
exit(-1);
}
while (n_done < n_hashes && (!*stop)) {
data[19] = n;
// Note: The /4 below is very important: THREADS_PER_CUDA_BLOCK is sized
// for the scrypt kernels, which use 4 threads per hash; the SHA-based
// start/finish kernels use 1 thread per hash, so they need 1/4 the blocks.
static const int threads = THREADS_PER_CUDA_BLOCK;
scrypt_hash_start_kernel<<<n_blocks/4, threads>>>(dev_job, dev_keys, dev_ostate, dev_tstate, n);
hasher_combo_kernel<<<n_blocks, threads>>>(dev_keys, dev_scratch);
scrypt_hash_finish_kernel<<<n_blocks/4, threads>>>(dev_keys, dev_tstate, dev_ostate, dev_output, dev_job);
error = cudaDeviceSynchronize();
if (error != cudaSuccess) {
fprintf(stderr, "Kernel execution failed, error code %d, line(%d)\n", error, __LINE__);
exit(-1);
}
error = cudaMemcpy(scan_output, dev_output, sizeof(uint32_t), cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
fprintf(stderr, "Could not memcpy from device, error code %d, line(%d)\n", error, __LINE__);
exit(-1);
}
if (*scan_output != 0) {
if (hashes_done != NULL) *hashes_done += n_done;
return n_done + *scan_output - 1; // -1: the kernel stores (thread ID + 1); zero means "not found"
}
n_done += batchsize;
n += batchsize;
}
if (hashes_done != NULL) *hashes_done += n_done;
return -1;
}
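/*
 * A sketch of how a cpuminer-style driver might call ScanNCoins
 * (hypothetical; pdata/ptarget follow the cpuminer convention assumed by
 * the signature above, with the starting nonce in pdata[19]):
 */
#if 0
unsigned long hashes_done = 0;
volatile int stop = 0;
CudaHasher hasher;
if (hasher.Initialize() == 0) {
    int offset = hasher.ScanNCoins(pdata, ptarget, 1 << 22, &stop, &hashes_done);
    if (offset >= 0) {
        /* winning nonce is pdata[19] + offset; -1 means nothing found */
    }
}
#endif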