Virtual Testbed
Ship dynamics simulator for extreme conditions
opencl/fourier_transform.hh
1 #ifndef VTESTBED_OPENCL_FOURIER_TRANSFORM_HH
2 #define VTESTBED_OPENCL_FOURIER_TRANSFORM_HH
3 
4 #include <complex>
5 #include <iosfwd>
6 #include <string>
7 #include <vector>
8 
9 #include <openclx/forward>
10 
11 #include <vtestbed/base/blitz.hh>
12 #include <vtestbed/opencl/opencl.hh>
13 
14 namespace vtb {
15 
16  namespace opencl {
17 
18  template <int N>
19  inline blitz::TinyVector<int,3>
20  make_shape(const blitz::TinyVector<int,N>& rhs) {
21  static_assert(0 < N && N <= 3, "bad N");
22  blitz::TinyVector<int,3> result;
23  result = 1;
24  for (int i=0; i<N; ++i) {
25  result(i) = rhs(i);
26  }
27  return result;
28  }
29 
30  template <>
31  inline blitz::TinyVector<int,3>
32  make_shape<3>(const blitz::TinyVector<int,3>& rhs) {
33  return rhs;
34  }
35 
36  template <int N>
37  inline blitz::TinyVector<int,N>
38  reduce_shape(const blitz::TinyVector<int,3>& rhs) {
39  static_assert(0 < N && N <= 3, "bad N");
40  blitz::TinyVector<int,N> result;
41  for (int i=0; i<N; ++i) {
42  result(i) = rhs(i);
43  }
44  return result;
45  }
46 
47  template <>
48  inline blitz::TinyVector<int,3>
49  reduce_shape<3>(const blitz::TinyVector<int,3>& rhs) {
50  return rhs;
51  }
52 
53  enum class Fourier_transform_format {
54  Split_complex = 0,
55  Interleaved_complex = 1
56  };
57 
58  struct Kernel_info {
59  clx::kernel kernel;
60  std::string name;
61  int lmem_size = 0;
62  int num_workgroups = 0;
63  int num_xforms_per_workgroup = 0;
64  int num_workitems_per_workgroup = 0;
65  int axis = 0;
66  bool in_place_possible = 0;
67  };
68 
70 
71  public:
72  typedef blitz::TinyVector<int,3> int3;
73  typedef blitz::TinyVector<int,2> int2;
74  typedef blitz::TinyVector<int,1> int1;
75 
76  private:
77  Context* _context = nullptr;
78  int3 _shape{0,0,0};
79  int _ndimensions = 1;
80  Fourier_transform_format _format = Fourier_transform_format::Interleaved_complex;
81  std::string _src;
82  std::vector<Kernel_info> _kernels;
83 
84  // flag indicating if temporary intermediate buffer is needed or not.
85  // this depends on fft kernels being executed and if transform is
86  // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
87  // one that does not require global transpose do not need temporary buffer)
88  // 2D 1024x1024 out-of-place fft however do require intermediate buffer.
89  // If temp buffer is needed, its allocation is lazy i.e. its not allocated
90  // until its needed
91  bool temp_buffer_needed = false;
92 
93  // Batch size is runtime parameter and size of temporary buffer (if needed)
94  // depends on batch size. Allocation of temporary buffer is lazy i.e. its
95  // only created when needed. Once its created at first call of clFFT_Executexxx
96  // it is not allocated next time if next time clFFT_Executexxx is called with
97  // batch size different than the first call. last_batch_size caches the last
98  // batch size with which this plan is used so that we dont keep allocating/deallocating
99  // temp buffer if same batch size is used again and again.
100  int last_batch_size = 0;
101 
102  // temporary buffer for interleaved plan
103  clx::buffer _workarea{nullptr};
104 
105  // Maximum size of signal for which local memory transposed based
106  // fft is sufficient i.e. no global mem transpose (communication)
107  // is needed
108  int max_localmem_fft_size = 2048;
109 
110  // Maximum work items per work group allowed. This, along with _maxradix below controls
111  // maximum local memory being used by fft kernels of this plan. Set to 256 by default
112  int _maxworkgroupsize = 256;
113 
114  // Maximum base radix for local memory fft ... this controls the maximum register
115  // space used by work items. Currently defaults to 16
116  int _maxradix = 16;
117 
118  // Device depended parameter that tells how many work-items need to be read consecutive
119  // values to make sure global memory access by work-items of a work-group result in
120  // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
121  int min_mem_coalesce_width = 16;
122 
123  // Number of local memory banks. This is used to geneate kernel with local memory
124  // transposes with appropriate padding to avoid bank conflicts to local memory
125  // e.g. on NVidia it is 16.
126  int num_local_mem_banks = 16;
127 
128  // kernel name index
129  int _kindex = 0;
130 
131  public:
132 
133  Fourier_transform_base() = default;
134 
135  explicit
136  Fourier_transform_base(const int3& shape);
137 
138  void
139  enqueue(clx::buffer x, int direction, int batch_size=1);
140 
141  inline void
142  forward(clx::buffer x, int batch_size=1) {
143  this->enqueue(x, -1, batch_size);
144  }
145 
146  inline void
147  backward(clx::buffer x, int batch_size=1) {
148  this->enqueue(x, 1, batch_size);
149  }
150 
151  void
152  dump(std::ostream& out);
153 
154  inline const int3&
155  shape() const noexcept {
156  return this->_shape;
157  }
158 
159  inline void
160  shape(const int3& rhs) {
161  if (!blitz::all(this->_shape == rhs)) {
162  this->_shape = rhs;
163  this->init();
164  }
165  }
166 
168  static void precompile(const int3& max_power, Context* context);
169 
170  inline void context(Context* rhs) { this->_context = rhs; }
171  inline Context* context() { return this->_context; }
172 
173  private:
174 
175  inline int
176  unique_kernel_index() {
177  return ++this->_kindex;
178  }
179 
181  kernel_name(const char* prefix);
182 
183  inline size_t
184  buffer_size(int batch_size) const {
185  return blitz::product(this->_shape) *
186  batch_size * 2 * sizeof(cl_float);
187  }
188 
189  void
190  generate_source_code();
191 
192  void
193  generate_fft(int axis);
194 
195  void
196  generate_fft_local();
197 
198  void
199  generate_fft_global(int n, int BS, int axis, int vertBS);
200 
201  void
202  getKernelWorkDimensions(
203  const Kernel_info& kernel,
204  int* batchSize,
205  size_t* gWorkItems,
206  size_t* lWorkItems
207  );
208 
209  void
210  allocate_temporary_buffer(int batch_size);
211 
212  void
213  init();
214 
215  };
216 
217  template <class T, int N>
219 
220  static_assert(std::is_same<std::complex<float>,T>::value, "bad T");
221  static_assert(0 < N && N <= 3, "bad N");
222 
223  private:
225 
226  public:
227  using shape_type = blitz::TinyVector<int,N>;
228  using Fourier_transform_base::context;
229 
230  public:
231 
232  Fourier_transform() = default;
233 
234  explicit
235  Fourier_transform(const shape_type& shape):
236  base_type(make_shape<N>(shape)) {}
237 
238  inline shape_type
239  shape() const noexcept {
240  return reduce_shape<N>(this->base_type::shape());
241  }
242 
243  inline void
244  shape(const shape_type& rhs) {
245  this->base_type::shape(make_shape<N>(rhs));
246  }
247 
248  static inline void
249  precompile(const shape_type& shp, Context* context) {
250  base_type::precompile(make_shape<N>(shp), context);
251  }
252 
253  inline void
254  forward(clx::buffer x, int batch_size=1) {
255  this->base_type::forward(x, batch_size);
256  }
257 
258  inline void
259  backward(clx::buffer x, int batch_size=1) {
260  this->base_type::backward(x, batch_size);
261  }
262 
263  using base_type::dump;
264 
265  };
266 
268 
269  private:
270  using int3 = blitz::TinyVector<int,3>;
271  using C = std::complex<float>;
273 
274  private:
275  int3 _shape;
276 
277  protected:
278  fft_type _fft;
279  Buffer<C> _chirp, _ichirp, _xp;
280  clx::kernel _makechirp, _reciprocal_chirp, _mult1, _mult2, _mult3, _zero_init;
281 
282  public:
283 
284  void
285  enqueue(clx::buffer x, int direction, int batch_size=1);
286 
287  void context(Context* rhs);
288  inline Context* context() { return this->_fft.context(); }
289 
290  protected:
291 
292  void
293  make_chirp(const int3& shape, const int3& fft_shape);
294 
295  };
296 
297  template <class T, int N>
299 
300  static_assert(std::is_same<std::complex<float>,T>::value, "bad T");
301  static_assert(0 < N && N <= 3, "bad N");
302 
303  public:
304  typedef blitz::TinyVector<int,N> shape_type;
305 
306  private:
307  typedef blitz::TinyVector<T,N> vec;
308  typedef blitz::RectDomain<N> domain;
309  typedef typename T::value_type R;
310 
311  private:
312  shape_type _shape{0};
313  bool _power_of_two = false;
314 
315  public:
316 
317  Chirp_Z_transform() = default;
318 
319  inline explicit
320  Chirp_Z_transform(const shape_type& shp) {
321  this->shape(shp);
322  }
323 
324  inline const shape_type&
325  shape() const noexcept {
326  return this->_shape;
327  }
328 
329  inline void
330  shape(const shape_type& rhs) {
331  using blitz::all;
332  if (all(this->_shape == rhs)) {
333  return;
334  }
335  this->_shape = rhs;
336  shape_type new_shape;
337  if (all(blitz::is_power_of_two(rhs))) {
338  new_shape = rhs;
339  _power_of_two = true;
340  } else {
341  new_shape = next_power_of_two(rhs*2 - 1);
342  _power_of_two = false;
343  }
344  auto fft_shape{make_shape<N>(new_shape)};
345  _fft.shape(fft_shape);
346  this->make_chirp(make_shape<N>(_shape), fft_shape);
347  }
348 
349  inline void
350  forward(clx::buffer x) {
351  this->transform(x, -1);
352  }
353 
354  inline void
355  backward(clx::buffer x) {
356  this->transform(x, 1);
357  }
358 
359  inline void
360  transform(clx::buffer x, int dir) {
361  using blitz::product;
362  if (_power_of_two) {
363  _fft.enqueue(x, dir, 1);
364  } else {
365  this->enqueue(x, dir, 1);
366  }
367  }
368 
369  };
370 
371  }
372 
373 }
374 
375 #endif // vim:filetype=cpp
Main namespace.
Definition: convert.hh:9
static void precompile(const int3 &max_power, Context *context)
Compile the code for each power of 2 up to max_power.