// CUDA
// C/C++    Fortran

// examples/getting_started/convolve.cpp

#include <stdio.h>
#include <arrayfire.h>
using namespace af;

// use static variables at file scope so timeit() wrapper functions
// can reference image/kernels

// image to convolve
static array img;

// 5x5 derivative with separable kernels
static float h_dx[] = {1.f / 12, -8.f / 12, 0, 8.f / 12, -1.f / 12}; // five point stencil
static float h_spread[] = {1.f / 5, 1.f / 5, 1.f / 5, 1.f / 5, 1.f / 5};
static array dx, spread, kernel; // device kernels

static array full_out, dsep_out, hsep_out; // save output for value checks
// wrapper functions for timeit() below

// Full 2D convolution of img with the 5x5 kernel; the result is stored
// in full_out for the value checks in main().
static void full()
{
    full_out = convolve(img, kernel);
}
// Separable convolution of img using the device kernels dx and spread;
// the result is stored in dsep_out for the value checks in main().
static void dsep()
{
    dsep_out = convolve(dx, spread, img);
}
// Separable convolution of img using the host tap arrays h_dx and h_spread
// (5 taps each); the result is stored in hsep_out for the value checks in
// main().
static void hsep()
{
    hsep_out = convolve(5, h_dx, 5, h_spread, img);
}

// Value check used by main(): reports whether two results disagree.
//
// Returns true (failure) unless every element-wise absolute difference
// between left and right is below 1e-6; returns false when all elements
// are within that tolerance.
//
// Callers throw when this returns true, so the result of alltrue() has to be
// negated: alltrue() is true exactly when the arrays match.
static bool fail(array &left, array &right) {
    return !alltrue<bool>(abs(left - right) < 1e-6);
}

int main(int argc, char** argv) {
    try {
        // setup image and device copies of kernels
        img = randu(640, 480);
        dx = array(5, 1, h_dx); // 5x1 kernel
        spread = array(1, 5, h_spread); // 1x5 kernel
        kernel = matmul(dx, spread); // 5x5 kernel

        printf("full 2D convolution:         %g seconds\n", timeit(full));
        printf("separable, device pointers:  %g seconds\n", timeit(dsep));
        printf("separable, host pointers:    %g seconds\n", timeit(hsep));

        // ensure values are all the same across versions
        if (fail(full_out, dsep_out)) { throw af::exception("full != dsep"); }
        if (fail(full_out, hsep_out)) { throw af::exception("full != hsep"); }
        if (fail(dsep_out, hsep_out)) { throw af::exception("dsep != hsep"); }

    } catch (af::exception& e) {
        fprintf(stderr, "%s\n", e.what());
    }

#ifdef WIN32 // pause in Windows
    if (!(argc == 2 && argv[1][0] == '-')) {
        printf("hit [enter]...");
        fflush(stdout);
        getchar();
    }
#endif
    return 0;
}