# gpuCalc.py -- forked from aFuerst/PyCUDA-Raster
from multiprocessing import Process, Pipe
import numpy as np
from gpustruct import GPUStruct
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
"""
GPUCalculator
Class that takes and sends data from pipes and goes GPU calculations on it
designed to run as a separate process and inherits from Process module
currently supported functions: slope, aspect, hillshade
copyright : (C) 2016 by Alex Feurst, Charles Kazer, William Hoffman
/***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
***************************************************************************/
"""
class GPUCalculator(Process):
"""
__init__
paramaters:
header - six-tuple header expected to be in this order: (ncols, nrows, cellsize, NODATA, xllcorner, yllcorner)
_input_pipe - a Pipe object to read information from
_outputPipe - a Pipe object to send information to
function_types - list of strings that are supported function names as strings
creates empty instance variables needed later
"""
def __init__(self, header, _input_pipe, _output_pipes, function_types):
Process.__init__(self)
self.input_pipe = _input_pipe
self.output_pipes = _output_pipes
self.functions = function_types
self.header = header
#--------------------------------------------------------------------------#
"""
_unpackInfo
gets data from header and creates carry_over_rows needed in _processData
"""
def _unpackInfo(self):
#unpack header info
self.totalCols = self.header[0]
self.totalRows = self.header[1]
self.cellsize = self.header[2]
self.NODATA = self.header[3]
#carry over rows used to insert last two lines of data from one page
#as first two lines in next page
self.carry_over_rows = [np.full(shape=self.totalCols, fill_value=self.NODATA, dtype=np.float32) \
, np.empty(shape=self.totalCols, dtype=np.float32)] # second array does not need to be filled with NODATA
self.np_copy_arr = [i for i in range(self.totalCols)]
#--------------------------------------------------------------------------#
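# Illustrative sketch (not part of the original code): how the carry-over rows stitch
# pages together. The last two rows of one page's buffer are copied in as the first two
# rows of the next page's buffer, so every interior raster row has both vertical
# neighbours available when the 3x3 kernel runs. A tiny NumPy illustration, assuming a
# 4-column raster:
#
#   import numpy as np
#   ncols = 4
#   page1 = np.arange(12, dtype=np.float32).reshape(3, ncols)  # first page: 3 rows
#   carry = [page1[-2].copy(), page1[-1].copy()]                # rows carried over
#   new_rows = [np.full(ncols, 7.0, dtype=np.float32)]          # next rows from the pipe
#   page2 = np.vstack(carry + new_rows)                         # carried rows reappear on top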
"""
run
Overrides default Process.run()
Given a kernel type, retrieves the C code for that kernel, and runs the
data processing loop
does CUDA initialization and sets local device and context
"""
def run(self):
import pycuda.autoinit
self._unpackInfo()
self._gpuAlloc()
# pre-compile requested kernels outside loop, only do it once this way
compiled_kernels = []
for function in self.functions:
kernel = self._getKernel(function)
compiled_kernels.append(kernel.get_function("raster_function"))
#Process data while we continue to receive input
processed_rows = 0
while self._recvData(processed_rows):
#Copy input data to GPU
cuda.memcpy_htod(self.data_gpu, self.to_gpu_buffer)
for i in range(len(compiled_kernels)):
# pass in compiled kernel for execution
self._processData(compiled_kernels[i])
#Get data back from GPU
cuda.memcpy_dtoh(self.from_gpu_buffer, self.result_gpu)
self._writeData(processed_rows, self.output_pipes[i])
processed_rows += (self.maxPossRows-2) # -2 because of buffer rows
print "Page done... %.3f %% completed" % ((float(processed_rows) / float(self.totalRows)) * 100)
#Process remaining data in buffer
cuda.memcpy_htod(self.data_gpu, self.to_gpu_buffer)
for i in range(len(self.functions)):
self._processData(compiled_kernels[i])
cuda.memcpy_dtoh(self.from_gpu_buffer, self.result_gpu)
self._writeData(processed_rows, self.output_pipes[i])
print "GPU calculations finished"
for pipe in self.output_pipes:
pipe.close()
# clean up on GPU
self.data_gpu.free()
self.result_gpu.free()
cuda.Context.pop()
#--------------------------------------------------------------------------#
"""
_gpuAlloc
determines how much free memory is on the GPU and allocates as much as needed
creates pagelocked buffers of equal size to GPU memory
"""
def _gpuAlloc(self):
#Get GPU information
self.freeMem = cuda.mem_get_info()[0] * .5 * .8 # use 80% of half the free memory; two equal-sized buffers are allocated below
self.maxPossRows = np.int(np.floor(self.freeMem / (4 * self.totalCols))) # 4 bytes per float32 cell
# set max rows to smaller number to save memory usage
if self.totalRows < self.maxPossRows:
print "reducing max rows to reduce memory use on GPU"
self.maxPossRows = self.totalRows
# create pagelocked buffers and GPU arrays
self.to_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
self.from_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
self.data_gpu = cuda.mem_alloc(self.to_gpu_buffer.nbytes)
self.result_gpu = cuda.mem_alloc(self.from_gpu_buffer.nbytes)
#--------------------------------------------------------------------------#
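# Illustrative sketch (not part of the original code): the paging arithmetic above
# with made-up numbers, assuming roughly 2 GB reported free and a 10000-column raster:
#
#   free_bytes = 2 * 1024**3                  # what cuda.mem_get_info()[0] might report
#   usable     = free_bytes * .5 * .8         # half each for the two buffers, 80% headroom
#   max_rows   = int(usable // (4 * 10000))   # 4 bytes per float32 cell -> 21474 rows per page
#
# Each page then moves max_rows * ncols * 4 bytes of pagelocked host memory to the GPU.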
"""
_recvData
Receives a page worth of data from the input pipe. The input pipe comes
from dataLoader.py. Copies over 2 rows from the previous page so the GPU
kernel computation works correctly.
If the pipe closes, fill the rest of the page with NODATA, and return false
to indicate that we should break out of the processing loop.
"""
def _recvData(self, count):
if count == 0:
#If this is the first page, insert a buffer row
np.put(self.to_gpu_buffer[0], self.np_copy_arr, self.carry_over_rows[0])
row_count = 1
else:
#otherwise, insert carry over rows from last page
np.put(self.to_gpu_buffer[0], self.np_copy_arr, self.carry_over_rows[0])
np.put(self.to_gpu_buffer[1], self.np_copy_arr, self.carry_over_rows[1])
row_count = 2
if count + row_count + self.maxPossRows > self.totalRows and row_count > 1: # this is the last page
try:
while row_count + count < self.totalRows: # get rest of rows from pipe
np.put(self.to_gpu_buffer[row_count], self.np_copy_arr, self.input_pipe.recv())
row_count += 1
#Pipe was closed unexpectedly
except EOFError:
print "Pipe closed unexpectedly."
self.stop()
self.to_gpu_buffer[row_count].fill(self.NODATA)
return False # finished receiving data, tell run to end
else: # this is not the last page
try:
while row_count < self.maxPossRows: # get max poss rows from pipe
np.put(self.to_gpu_buffer[row_count], self.np_copy_arr, self.input_pipe.recv())
row_count += 1
#Pipe was closed unexpectedly
except EOFError:
print "Pipe closed unexpectedly."
self.stop()
#Update carry over rows
np.put(self.carry_over_rows[0], self.np_copy_arr, self.to_gpu_buffer[self.maxPossRows-2])
np.put(self.carry_over_rows[1], self.np_copy_arr, self.to_gpu_buffer[self.maxPossRows-1])
return True # not finished receiving data, tell run to keep looping
#--------------------------------------------------------------------------#
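# Illustrative sketch (not part of the original code): _recvData expects the loader end
# of the pipe to send one row (a length-ncols float32 array) per recv(). A minimal
# stand-in for dataLoader.py, for testing, might look like this:
#
#   import numpy as np
#   from multiprocessing import Pipe
#   calc_end, loader_end = Pipe()               # loader_end plays the role of dataLoader.py
#   for row in np.random.rand(8, 16).astype(np.float32):
#       loader_end.send(row)                    # one raster row per send
#   loader_end.close()                          # recv() raises EOFError once rows are drained
#   first_row = calc_end.recv()                 # what self.input_pipe.recv() returns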
"""
_processData
Using the given kernel code packed in mod, allocates memory on the GPU,
and runs the kernel.
"""
def _processData(self, func):
#GPU layout information
grid = (256,256)
block = (32,32,1)
num_blocks = grid[0] * grid[1]
threads_per_block = block[0]*block[1]*block[2]
pixels_per_thread = (self.maxPossRows * self.totalCols) / (threads_per_block * num_blocks)
# shrink the grid until each thread has at least a few pixels, while making sure every pixel is calculated
while pixels_per_thread < 3:
grid = (grid[0] - 16,grid[1] - 16)
num_blocks = grid[0] * grid[1]
pixels_per_thread = (self.maxPossRows * self.totalCols) / (threads_per_block * num_blocks)
pixels_per_thread = np.ceil(pixels_per_thread)
#information struct passed to GPU
stc = GPUStruct([
(np.uint64, 'pixels_per_thread', pixels_per_thread),
(np.float64, 'NODATA', self.NODATA),
(np.uint64, 'ncols', self.totalCols),
(np.uint64, 'nrows', self.maxPossRows),
(np.uint64, 'npixels', self.maxPossRows*self.totalCols),
(np.float32, 'cellSize', self.cellsize)
])
stc.copy_to_gpu()
#Call GPU kernel
func(self.data_gpu, self.result_gpu, stc.get_ptr(), block=block, grid=grid)
#--------------------------------------------------------------------------#
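# Illustrative sketch (not part of the original code): the launch arithmetic above,
# with made-up numbers. A 256x256 grid of 32x32x1 blocks is 65536 blocks * 1024
# threads = 67,108,864 threads; a page of 21474 rows x 10000 cols (~214.7 million
# pixels) gives ~3 pixels per thread, so the grid is left as-is. A much smaller
# page shrinks the grid in steps of 16 per dimension, mirroring the loop above:
#
#   grid, block = (256, 256), (32, 32, 1)
#   threads_per_block = block[0] * block[1] * block[2]
#   pixels = 1000 * 1000                                      # hypothetical small page
#   while pixels / (threads_per_block * grid[0] * grid[1]) < 3:
#       grid = (grid[0] - 16, grid[1] - 16)                   # stops here at (16, 16)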
"""
_writeData
Writes results to output pipe. This pipe goes to dataSaver.py
"""
def _writeData(self, count, out_pipe):
if count + (self.maxPossRows-1) > self.totalRows:
r = self.totalRows - (count-1)
else:
r = self.maxPossRows-1
for row in range(1, r):
out_pipe.send(self.from_gpu_buffer[row])
#--------------------------------------------------------------------------#
"""
stop
Alerts the thread that it needs to quit
Cleans up CUDA and pipes
"""
def stop(self):
print "Stopping gpuCalc..."
try:
self.data_gpu.free() # free GPU memory allocations
self.result_gpu.free()
except:
pass
cuda.Context.pop() # remove CUDA context
for pipe in self.output_pipes:
pipe.close() # close all pipes
self.input_pipe.close()
exit(1)
#--------------------------------------------------------------------------#
"""
_getRasterFunc
Given a string representing the raster function to calculate,
returns the required code to append to the CUDA kernel.
"""
# NOTE: The kernel code in _getKernel only supports functions that are based
# on a 3x3 grid of neighbors.
#
# To add your own computation, add an if statement looking for it, and return
# a tuple containing the C code for your function surrounded by triple quotes,
# and how that function should be called within the code in _getKernel.
# (A hedged example of such an entry follows this method.)
#
# Possible parameters you can use from _getKernel:
# float *nbhd /* this is the 3x3 grid of neighbors */
# float dz_dx
# float dz_dy
# unsigned long long file_info->pixels_per_thread
# double file_info->NODATA
# unsigned long long file_info->ncols
# unsigned long long file_info->nrows
# unsigned long long file_info->npixels
# float file_info->cellSize
def _getRasterFunc(self, func):
if func == "slope":
return (\
"""
/*
GPU only function that calculates slope for a pixel
*/
__device__ float slope(float dz_dx, float dz_dy){
return atan(sqrt(pow(dz_dx, 2) + pow(dz_dy, 2)));
}
""",\
"""
slope(dz_dx, dz_dy)
""")
elif func == "aspect":
return (\
"""
/*
GPU only function that calculates aspect for a pixel
*/
__device__ float aspect(float dz_dx, float dz_dy, double NODATA){
float aspect = 57.29578 * (atan2(dz_dy, -(dz_dx)));
if(dz_dx == NODATA || dz_dy == NODATA || (dz_dx == 0.0 && dz_dy == 0.0)){
return NODATA;
} else{
if(aspect > 90.0){
aspect = 360.0 - aspect + 90.0;
} else {
aspect = 90.0 - aspect;
}
aspect = aspect * (M_PI / 180.0);
return aspect;
}
}
""",\
"""
aspect(dz_dx, dz_dy, file_info->NODATA)
""")
elif func == "hillshade":
return (self._getRasterFunc('slope')[0] + \
"""
/*
GPU only function that calculates aspect for a pixel
to be ONLY used by hillshade
*/
__device__ float hillshade_aspect(float dz_dx, float dz_dy){
float aspect;
if(dz_dx != 0){
aspect = atan2(dz_dy, -(dz_dx));
if(aspect < 0){
aspect = ((2 * M_PI) + aspect);
}
} else if(dz_dx == 0){
if(dz_dy > 0){
aspect = (M_PI / 2);
}else if(dz_dy < 0){
aspect = ((2 * M_PI) - (M_PI / 2));
}else{
aspect = atan2(dz_dy, -(dz_dx));
}
}
return aspect;
}
/*
GPU only function that calculates hillshade for a pixel
*/
__device__ float hillshade(float dz_dx, float dz_dy){
/* calc slope and aspect */
float slp = slope(dz_dx, dz_dy);
float asp = hillshade_aspect(dz_dx, dz_dy);
/* calc zenith */
float altitude = 45;
float zenith_deg = 90 - altitude;
float zenith_rad = zenith_deg * (M_PI / 180.0);
/* calc azimuth */
float azimuth = 315;
float azimuth_math = (360 - azimuth + 90);
if(azimuth_math >= 360.0){
azimuth_math = azimuth_math - 360;
}
float azimuth_rad = (azimuth_math * M_PI / 180.0);
float hs = 255.0 * ( ( cos(zenith_rad) * cos(slp) ) + ( sin(zenith_rad) * sin(slp) * cos(azimuth_rad - asp) ) );
if(hs < 0){
return 0;
} else {
return hs;
}
}
""",\
"""
hillshade(dz_dx, dz_dy)
""")
else:
print "Function %s not implemented" % func
self.stop()
#raise NotImplementedError
#--------------------------------------------------------------------------#
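# Illustrative sketch (not part of the original code): adding a hypothetical
# "gradient" entry to _getRasterFunc, following the (definition, call) tuple
# pattern used above. The name and formula are examples only, not part of the
# supported function list.
#
#   elif func == "gradient":
#       return (\
#       """
#       /* GPU only function that returns the gradient magnitude for a pixel */
#       __device__ float gradient(float dz_dx, float dz_dy){
#           return sqrt(pow(dz_dx, 2) + pow(dz_dy, 2));
#       }
#       """,\
#       """
#       gradient(dz_dx, dz_dy)
#       """)
#
# The string "gradient" would then be passed in function_types, with a matching
# output pipe, and _getKernel splices the definition and the call into the kernel.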
"""
_getKernel
Packages the kernel module. This kernel assumes that the raster calculations
will be based on the dx_dz and dy_dz values, which are calculated from a 3x3
grid surrounding the current pixel.
"""
# NOTE: To create another raster function, you must create an additional
# entry in _getRasterCalc. Currently only supports calculations based on a
# 3x3 grid surrounding a pixel.
# The GPUCalculator class is set up to automatically insert buffer rows at
# the beginning and end of the file so that all rows are calculated correctly.
def _getKernel(self, funcType):
func_def, func_call = self._getRasterFunc(funcType)
mod = SourceModule("""
#include <math.h>
#include <stdio.h>
#ifndef M_PI
#define M_PI 3.14159265
#endif
typedef struct{
/* struct representing the relevant data passed in by host */
unsigned long long pixels_per_thread;
double NODATA;
unsigned long long ncols;
unsigned long long nrows;
unsigned long long npixels;
float cellSize;
} passed_in;
/************************************************************************************************
GPU only function that gets the neighbors of the pixel at offset
stores them in the passed-by-reference array 'store'
************************************************************************************************/
__device__ int getKernel(float *store, float *data, unsigned long offset, passed_in *file_info){
//NOTE: This is more or less appropriated from Liam's code. Treats edge rows and columns
// as buffers, they will be dropped.
if (offset < file_info->ncols || offset >= (file_info->npixels - file_info->ncols)){
return 1;
}
unsigned long y = offset % file_info->ncols; // column index within the row, used to skip edge columns below
if (y == (file_info->ncols - 1) || y == 0){
return 1;
}
// Grab neighbors above and below.
store[1] = data[offset - file_info->ncols];
store[7] = data[offset + file_info->ncols];
// Grab right side neighbors.
store[2] = data[offset - file_info->ncols + 1];
store[5] = data[offset + 1];
store[8] = data[offset + file_info->ncols + 1];
// Grab left side neighbors.
store[0] = data[offset - file_info->ncols - 1];
store[3] = data[offset - 1];
store[6] = data[offset + file_info->ncols - 1];
/* return a value otherwise it throws a warning expression not having effect */
return 0;
}
"""\
+ func_def + \
"""
/************************************************************************************************
CUDA Kernel function that applies the selected raster function to pixels in 'data' and stores the results in 'result'
handles a variable number of calculations based on its thread/block location
and the size of pixels_per_thread in file_info
************************************************************************************************/
__global__ void raster_function(float *data, float *result, passed_in *file_info){
/* get individual thread x,y values */
unsigned long long x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned long long y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned long long offset = (gridDim.x*blockDim.x) * y + x;
//gridDim.x * blockDim.x is the width of the grid in threads. This moves us to the correct
//block and thread.
unsigned long long i;
/* list to store 3x3 kernel each pixel needs to calc slope */
float nbhd[9];
/* iterate over assigned pixels and calculate slope for all of them */
/* do pixels_per_thread + 1 to make sure the last row(s) get done */
for(i=0; i < file_info -> pixels_per_thread + 1 && offset < file_info -> npixels; ++i){
if(data[offset] == file_info -> NODATA){
result[offset] = file_info -> NODATA;
} else {
int q = getKernel(nbhd, data, offset, file_info);
if (q) {
result[offset] = file_info->NODATA;
} else {
for(q = 0; q < 9; ++q){
if(nbhd[q] == file_info -> NODATA){
nbhd[q] = data[offset];
}
}
float dz_dx = (nbhd[2] + (2*nbhd[5]) + nbhd[8] - (nbhd[0] + (2*nbhd[3]) + nbhd[6])) / (8 * file_info -> cellSize);
float dz_dy = (nbhd[6] + (2*nbhd[7]) + nbhd[8] - (nbhd[0] + (2*nbhd[1]) + nbhd[2])) / (8 * file_info -> cellSize);
result[offset] =""" + func_call + """;
}
}
offset += (gridDim.x*blockDim.x) * (gridDim.y*blockDim.y);
//grid stride: jump ahead by the total number of threads in the grid
}
}
""")
return mod
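# Illustrative sketch (not part of the original code): a CPU reference for the
# dz_dx / dz_dy / slope computation in the kernel above, useful for spot-checking a
# single interior pixel. nbhd is the 3x3 neighbourhood in row-major order, matching
# the indexing in getKernel(); the function name is hypothetical.
#
#   import math
#   def slope_reference(nbhd, cellsize):
#       dz_dx = (nbhd[2] + 2*nbhd[5] + nbhd[8] - (nbhd[0] + 2*nbhd[3] + nbhd[6])) / (8.0 * cellsize)
#       dz_dy = (nbhd[6] + 2*nbhd[7] + nbhd[8] - (nbhd[0] + 2*nbhd[1] + nbhd[2])) / (8.0 * cellsize)
#       return math.atan(math.sqrt(dz_dx**2 + dz_dy**2))   # same formula as the slope device function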
if __name__=="__main__":
pass
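# Illustrative usage sketch (not part of the original code): wiring GPUCalculator
# up with plain Pipes in place of dataLoader.py / dataSaver.py. The header layout
# and the one-row-per-recv() protocol follow this file; names and numbers here are
# made up, and the block is left commented out so importing this module has no
# side effects. Assumes a CUDA-capable device and a working PyCUDA install.
#
#   import numpy as np
#   from multiprocessing import Pipe
#   ncols, nrows, cellsize, NODATA = 16, 8, 30.0, -9999.0
#   header = (ncols, nrows, cellsize, NODATA, 0.0, 0.0)  # (ncols, nrows, cellsize, NODATA, xll, yll)
#   in_recv, in_send = Pipe(False)     # loader -> GPUCalculator
#   out_recv, out_send = Pipe(False)   # GPUCalculator -> saver, one pipe per function
#   calc = GPUCalculator(header, in_recv, [out_send], ["slope"])
#   calc.start()
#   out_send.close()                   # parent's copy; the child keeps its own handle
#   for row in np.random.rand(nrows, ncols).astype(np.float32):
#       in_send.send(row)              # one row of ncols float32 values per send
#   in_send.close()
#   results = []
#   try:
#       while True:
#           results.append(out_recv.recv())
#   except EOFError:
#       pass                           # child closed its output pipes; all rows received
#   calc.join()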