doc/vtestbed/opencl_2fourier__transform_8hh_source.html

 #ifndef VTESTBED_OPENCL_FOURIER_TRANSFORM_HH
 #define VTESTBED_OPENCL_FOURIER_TRANSFORM_HH

 #include <complex>
 #include <cstddef>
 #include <iosfwd>
 #include <string>
 #include <type_traits>
 #include <vector>

 #include <openclx/forward>

 #include <vtestbed/base/blitz.hh>
 #include <vtestbed/opencl/opencl.hh>

 namespace vtb {

     namespace opencl {

         /// Widen an N-dimensional shape (1 <= N <= 3) to three dimensions.
         /// The first N components are taken from \p rhs, every remaining
         /// component is set to 1.
         template <int N>
         inline blitz::TinyVector<int,3>
         make_shape(const blitz::TinyVector<int,N>& rhs) {
             static_assert(0 < N && N <= 3, "bad N");
             // start from all ones, then overwrite the leading N extents
             blitz::TinyVector<int,3> padded(1);
             for (int axis=0; axis!=N; ++axis) {
                 padded[axis] = rhs[axis];
             }
             return padded;
         }

         /// A three-dimensional shape needs no padding: it is returned as is.
         template <>
         inline blitz::TinyVector<int,3>
         make_shape<3>(const blitz::TinyVector<int,3>& rhs) { return rhs; }

         /// Narrow a three-dimensional shape to its first N components
         /// (1 <= N <= 3); the trailing 3-N components are dropped.
         template <int N>
         inline blitz::TinyVector<int,N>
         reduce_shape(const blitz::TinyVector<int,3>& rhs) {
             static_assert(0 < N && N <= 3, "bad N");
             blitz::TinyVector<int,N> leading;
             for (int axis=0; axis!=N; ++axis) {
                 leading[axis] = rhs[axis];
             }
             return leading;
         }

         /// Nothing to drop for N == 3: the shape is returned as is.
         template <>
         inline blitz::TinyVector<int,3>
         reduce_shape<3>(const blitz::TinyVector<int,3>& rhs) { return rhs; }

         /// Memory layout tag for complex data handled by the transform.
         /// The numeric values are fixed explicitly (0 and 1).
         enum class Fourier_transform_format: int {
             Split_complex = 0,
             Interleaved_complex = 1
         };

         /// Per-kernel record: the kernel object, its name and a set of
         /// integer launch parameters, all zero-initialised by default.
         /// NOTE(review): the per-field remarks below are inferred from the
         /// field names only — confirm against fourier_transform.cc.
         struct Kernel_info {
             clx::kernel kernel;
             std::string name;
             int lmem_size{0};                   // presumably local memory size
             int num_workgroups{0};
             int num_xforms_per_workgroup{0};
             int num_workitems_per_workgroup{0};
             int axis{0};                        // presumably the transformed axis
             bool in_place_possible{false};
         };

         /// Non-template core of the OpenCL Fourier transform.
         ///
         /// Stores a three-dimensional shape, the generated kernel source and
         /// kernel list, an optional temporary work buffer and a set of tuning
         /// parameters. Transforms are enqueued on a clx::buffer via enqueue(),
         /// forward() or backward(). The Context pointer is not owned: it is
         /// only stored and handed back by the context() accessors.
         class Fourier_transform_base {

         public:
             typedef blitz::TinyVector<int,3> int3;
             typedef blitz::TinyVector<int,2> int2;
             typedef blitz::TinyVector<int,1> int1;

         private:
             Context* _context = nullptr;
             int3 _shape{0,0,0};
             int _ndimensions = 1;
             Fourier_transform_format _format = Fourier_transform_format::Interleaved_complex;
             std::string _src;
             std::vector<Kernel_info> _kernels;

             // Flag indicating whether a temporary intermediate buffer is needed.
             // This depends on the fft kernels being executed and on whether the
             // transform is in-place or out-of-place, e.g. a local memory fft
             // (say 1D 1024 ... one that does not require a global transpose) does
             // not need a temporary buffer, whereas a 2D 1024x1024 out-of-place fft
             // does require an intermediate buffer.
             // If the temp buffer is needed, its allocation is lazy, i.e. it is not
             // allocated until it is needed.
             bool temp_buffer_needed = false;

             // Batch size is a runtime parameter and the size of the temporary buffer
             // (if needed) depends on the batch size. Allocation of the temporary
             // buffer is lazy, i.e. it is only created when needed. Once it is created
             // at the first call of clFFT_Executexxx it is not allocated next time if
             // next time clFFT_Executexxx is called with batch size different than the
             // first call. last_batch_size caches the last batch size with which this
             // plan is used so that we don't keep allocating/deallocating the temp
             // buffer if the same batch size is used again and again.
             int last_batch_size = 0;

             // temporary buffer for interleaved plan
             clx::buffer _workarea{nullptr};

             // Maximum size of signal for which a local memory transpose based
             // fft is sufficient, i.e. no global memory transpose (communication)
             // is needed.
             int max_localmem_fft_size = 2048;

             // Maximum work items per work group allowed. This, along with _maxradix
             // below, controls the maximum local memory being used by fft kernels of
             // this plan. Set to 256 by default.
             int _maxworkgroupsize = 256;

             // Maximum base radix for local memory fft ... this controls the maximum
             // register space used by work items. Currently defaults to 16.
             int _maxradix = 16;

             // Device-dependent parameter that tells how many work-items need to read
             // consecutive values to make sure global memory access by work-items of a
             // work-group results in coalesced memory access to utilize full bandwidth,
             // e.g. on NVidia Tesla this is 16.
             int min_mem_coalesce_width = 16;

             // Number of local memory banks. This is used to generate kernels with
             // local memory transposes with appropriate padding to avoid bank conflicts
             // in local memory, e.g. on NVidia it is 16.
             int num_local_mem_banks = 16;

             // kernel name index
             int _kindex = 0;

         public:

             Fourier_transform_base() = default;

             explicit
             Fourier_transform_base(const int3& shape);

             void
             enqueue(clx::buffer x, int direction, int batch_size=1);

             /// Enqueue the transform of \p x with direction -1.
             inline void
             forward(clx::buffer x, int batch_size=1) {
                 this->enqueue(x, -1, batch_size);
             }

             /// Enqueue the transform of \p x with direction +1.
             inline void
             backward(clx::buffer x, int batch_size=1) {
                 this->enqueue(x, 1, batch_size);
             }

             void
             dump(std::ostream& out);

             /// Current three-dimensional shape.
             inline const int3&
             shape() const noexcept {
                 return this->_shape;
             }

             /// Set a new shape. init() is invoked only when \p rhs differs
             /// from the stored shape in at least one component.
             inline void
             shape(const int3& rhs) {
                 if (!blitz::all(this->_shape == rhs)) {
                     this->_shape = rhs;
                     this->init();
                 }
             }

             static void precompile(const int3& max_power, Context* context);

             /// Store the (non-owned) context pointer.
             inline void context(Context* rhs) { this->_context = rhs; }

             /// Stored context pointer; const so that it is usable on const objects.
             inline Context* context() const { return this->_context; }

         private:

             /// Increment the kernel name counter and return the new value.
             inline int
             unique_kernel_index() {
                 return ++this->_kindex;
             }

             std::string
             kernel_name(const char* prefix);

             /// Size in bytes of \p batch_size signals of the current shape,
             /// counting two cl_float values per element.
             inline size_t
             buffer_size(int batch_size) const {
                 // Widen every factor to size_t before multiplying: doing the
                 // product in int overflows (undefined behaviour) as soon as
                 // shape * batch_size * 2 reaches 2^31.
                 size_t num_elements = static_cast<size_t>(batch_size);
                 for (int i=0; i<3; ++i) {
                     num_elements *= static_cast<size_t>(this->_shape(i));
                 }
                 return num_elements * 2 * sizeof(cl_float);
             }

             void
             generate_source_code();

             void
             generate_fft(int axis);

             void
             generate_fft_local();

             void
             generate_fft_global(int n, int BS, int axis, int vertBS);

             void
             getKernelWorkDimensions(
                 const Kernel_info& kernel,
                 int* batchSize,
                 size_t* gWorkItems,
                 size_t* lWorkItems
             );

             void
             allocate_temporary_buffer(int batch_size);

             void
             init();

         };

         /// N-dimensional (1 <= N <= 3) front end to Fourier_transform_base
         /// for std::complex<float> data. Shapes are widened with make_shape
         /// on the way in and narrowed with reduce_shape on the way out; all
         /// work is forwarded to the privately inherited base.
         template <class T, int N>
         class Fourier_transform: private Fourier_transform_base {

             static_assert(std::is_same<std::complex<float>,T>::value, "bad T");
             static_assert(0 < N && N <= 3, "bad N");

         private:
             using base_type = Fourier_transform_base;

         public:
             using shape_type = blitz::TinyVector<int,N>;

             // members of the private base that stay publicly reachable
             using Fourier_transform_base::context;
             using base_type::dump;

         public:

             Fourier_transform() = default;

             explicit
             Fourier_transform(const shape_type& shape):
             base_type(make_shape<N>(shape)) {}

             /// First N components of the base's three-dimensional shape.
             shape_type shape() const noexcept {
                 return reduce_shape<N>(base_type::shape());
             }

             /// Widen \p rhs to three dimensions and pass it to the base.
             void shape(const shape_type& rhs) {
                 base_type::shape(make_shape<N>(rhs));
             }

             /// Forward to the base precompile() with the widened shape.
             static void precompile(const shape_type& shp, Context* context) {
                 base_type::precompile(make_shape<N>(shp), context);
             }

             void forward(clx::buffer x, int batch_size=1) {
                 base_type::forward(x, batch_size);
             }

             void backward(clx::buffer x, int batch_size=1) {
                 base_type::backward(x, batch_size);
             }

         };

         /// Non-template part of the chirp Z-transform: owns the underlying
         /// Fourier transform object, three complex buffers and the kernels
         /// used by enqueue() and make_chirp().
         class Chirp_Z_transform_base {

         private:
             using int3 = blitz::TinyVector<int,3>;
             using C = std::complex<float>;
             using fft_type = Fourier_transform_base;

         private:
             // Zero-initialised (as in Fourier_transform_base) so that a
             // default-constructed object never holds indeterminate extents.
             int3 _shape{0,0,0};

         protected:
             fft_type _fft;
             Buffer<C> _chirp, _ichirp, _xp;
             clx::kernel _makechirp, _reciprocal_chirp, _mult1, _mult2, _mult3, _zero_init;

         public:

             void
             enqueue(clx::buffer x, int direction, int batch_size=1);

             void context(Context* rhs);

             /// Context pointer held by the underlying Fourier transform.
             inline Context* context() { return this->_fft.context(); }

         protected:

             void
             make_chirp(const int3& shape, const int3& fft_shape);

         };

         /// N-dimensional (1 <= N <= 3) chirp Z-transform front end for
         /// std::complex<float> data.
         ///
         /// When every extent of the shape is a power of two the transform is
         /// handed directly to the underlying Fourier transform; otherwise
         /// Chirp_Z_transform_base::enqueue() is used, with the underlying
         /// Fourier transform sized to next_power_of_two(2*shape - 1).
         template <class T, int N>
         class Chirp_Z_transform: public Chirp_Z_transform_base {

             static_assert(std::is_same<std::complex<float>,T>::value, "bad T");
             static_assert(0 < N && N <= 3, "bad N");

         public:
             typedef blitz::TinyVector<int,N> shape_type;

         private:
             typedef blitz::TinyVector<T,N> vec;
             typedef blitz::RectDomain<N> domain;
             typedef typename T::value_type R;

         private:
             shape_type _shape{0};
             bool _power_of_two = false;

         public:

             Chirp_Z_transform() = default;

             /// Construct and immediately configure for shape \p shp.
             inline explicit
             Chirp_Z_transform(const shape_type& shp) {
                 this->shape(shp);
             }

             /// Shape last set through shape(const shape_type&).
             inline const shape_type&
             shape() const noexcept {
                 return this->_shape;
             }

             /// Set the transform shape. Does nothing when \p rhs equals the
             /// stored shape; otherwise picks the FFT shape, resizes the
             /// underlying Fourier transform and calls make_chirp().
             inline void
             shape(const shape_type& rhs) {
                 using blitz::all;
                 if (all(this->_shape == rhs)) {
                     return;
                 }
                 this->_shape = rhs;
                 shape_type new_shape;
                 if (all(blitz::is_power_of_two(rhs))) {
                     new_shape = rhs;
                     _power_of_two = true;
                 } else {
                     new_shape = next_power_of_two(rhs*2 - 1);
                     _power_of_two = false;
                 }
                 // Copy-initialisation instead of `auto x{...}`: under the
                 // pre-N3922 rules the braced form deduces
                 // std::initializer_list rather than the vector type.
                 const auto fft_shape = make_shape<N>(new_shape);
                 _fft.shape(fft_shape);
                 this->make_chirp(make_shape<N>(_shape), fft_shape);
             }

             /// Transform \p x with direction -1.
             inline void
             forward(clx::buffer x) {
                 this->transform(x, -1);
             }

             /// Transform \p x with direction +1.
             inline void
             backward(clx::buffer x) {
                 this->transform(x, 1);
             }

             /// Transform \p x in direction \p dir with batch size 1, using the
             /// plain Fourier transform for power-of-two shapes and the base
             /// class enqueue() otherwise.
             inline void
             transform(clx::buffer x, int dir) {
                 if (_power_of_two) {
                     _fft.enqueue(x, dir, 1);
                 } else {
                     this->enqueue(x, dir, 1);
                 }
             }

         };

     }

 }

 #endif // vim:filetype=cpp
vtb::opencl::Chirp_Z_transform_base
Definition: opencl/fourier_transform.hh:267

std::is_same

std::complex

std::string

vtb::opencl::Kernel_info
Definition: opencl/fourier_transform.hh:58

vtb::opencl::Buffer
Definition: pipeline.hh:37

vtb
Main namespace.
Definition: convert.hh:9

std::vector

vtb::opencl::Fourier_transform_base::precompile
static void precompile(const int3 &max_power, Context *context)
Compile the code for each power of 2 up to max_power.
Definition: opencl/fourier_transform.cc:1404

vtb::opencl::Fourier_transform_base
Definition: opencl/fourier_transform.hh:69

vtb::opencl::Chirp_Z_transform
Definition: opencl/fourier_transform.hh:298

vtb::opencl::Fourier_transform
Definition: opencl/fourier_transform.hh:218

vtb::opencl::Context
Definition: opencl.hh:30

std::ostream