/*
 * Copyright (c) 1997 Massachusetts Institute of Technology
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to use, copy, modify, and distribute the Software without
 * restriction, provided the Software, including any modified copies made
 * under this license, is not distributed for a fee, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE MASSACHUSETTS INSTITUTE OF TECHNOLOGY BE LIABLE
 * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Except as contained in this notice, the name of the Massachusetts
 * Institute of Technology shall not be used in advertising or otherwise
 * to promote the sale, use or other dealings in this Software without
 * prior written authorization from the Massachusetts Institute of
 * Technology.
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include <fftw_threads.h>

/* Work budget used by main() to pick the repetition count for each
   problem size N: max_iter = NUM_ITER / (N * log(2N)), at least 1. */
#define NUM_ITER 20000000L

/* Number of entries in the 1D size table (n1) declared in main(). */
#define N_TESTS_1D 16
/* Number of entries in the 3D size table (n3) declared in main(). */
#define N_TESTS_3D 9

/* Fills the first n elements of arr with a fixed value; defined below,
   after main(). */
extern void initialize_fft_data(FFTW_COMPLEX * arr, long n);

/*
 * Benchmark driver comparing the threaded transforms (fftw_threads,
 * fftwnd_threads) with their serial counterparts (fftw, fftwnd).
 *
 * Usage: time_threads nthreads
 *
 * For every size in the 1D table and then in the 3D table, the transform
 * is repeated max_iter times with and without threads.  The time spent
 * re-initializing the input (measured separately as init_t) is subtracted,
 * the result is normalized by max_iter * N * log2(N), and the serial time,
 * the threaded time and their ratio are printed as one table row.
 *
 * Returns 0 on success and 1 when nthreads is missing or < 1; exits with
 * status 1 if the work buffers cannot be allocated.
 */
int main(int argc, char **argv)
{
    int nthreads = 0;
    /* 1D transform sizes (powers of two from 16 to 524288). */
    int n1[N_TESTS_1D] =
    {
        16,
        32,
        64,
        128,
        256,
        512,
        1024,
        2048,
        4096,
        8192,
        16384,
        32768,
        65536,
        131072,
        262144,
        524288
    };
    /* 3D transform dimensions. */
    int n3[N_TESTS_3D][3] =
    {
        { 16, 16, 16 },
        { 24, 24, 24 },
        { 32, 32, 32 },
        { 49, 49, 49 },
        { 64, 64, 64 },
        { 80, 80, 80 },
        { 100, 100, 100 },
        { 128, 128, 128 },
        { 256, 256, 256 },
    };
    int i, test, iter, max_iter;
    fftw_time start_t, end_t, init_t;
    FFTW_COMPLEX *cin, *out;
    double time_scale, time1, time2;
    int max_size;
    fftw_plan plan;
    fftwnd_plan plan_nd;

    if (argc > 1)
        nthreads = atoi(argv[1]);

    if (nthreads < 1) {
        printf("Usage: time_threads nthreads\n"
               "  -- benchmarks fftw_threads and fftwnd_threads "
               "against fftw and fftwnd\n"
               "     using nthreads parallel threads of execution.\n");
        return 1;
    }

    fftw_threads_init();

    /*************** Benchmark fftw_threads ****************/

    /* Size the buffers once for the largest 1D problem. */
    max_size = 0;
    for (i = 0; i < N_TESTS_1D; ++i)
        if (n1[i] > max_size)
            max_size = n1[i];

    cin = fftw_malloc(max_size * sizeof(FFTW_COMPLEX));
    out = fftw_malloc(max_size * sizeof(FFTW_COMPLEX));

    if (!cin || !out) {
        /* The byte count has type size_t; cast it so the argument
           matches the conversion specifier. */
        printf("Not enough memory!  At least %lu bytes needed.\n",
               (unsigned long) (max_size * sizeof(FFTW_COMPLEX) * 2));
        exit(1);
    }
    printf("%15s%20s%20s%20s\n", "Array Size", "FFTW", "FFTW_THREADS",
           "Speedup Factor");

    for (test = 0; test < N_TESTS_1D; ++test) {
        plan = fftw_create_plan(n1[test], FFTW_FORWARD, FFTW_MEASURE);

        /* Fewer repetitions for larger sizes, but always at least one. */
        max_iter = NUM_ITER / (n1[test] * log(2.0 * n1[test]));

        if (max_iter < 1)
            max_iter = 1;

        /* 1e6 / (iterations * N * log2(N)): normalizes the measured time
           (presumably seconds -> microseconds) per unit of N*log2(N). */
        time_scale = 1.0e6 / (max_iter * (log(n1[test]) / log(2.0) * n1[test]));

        /* Measure the cost of the input initialization alone so it can be
           subtracted from the transform timings below. */
        initialize_fft_data(cin, n1[test]);
        start_t = fftw_get_time();
        for (iter = 0; iter < max_iter; ++iter)
            initialize_fft_data(cin, n1[test]);
        end_t = fftw_get_time();
        init_t = fftw_time_diff(end_t, start_t);

        printf("%15d", n1[test]);
        fflush(stdout);

        /* Time FFTW (one untimed warm-up call, then the timed loop): */

        initialize_fft_data(cin, n1[test]);
        fftw(plan, 1, cin, 1, 0, out, 1, 0);
        start_t = fftw_get_time();
        for (iter = 0; iter < max_iter; ++iter) {
            initialize_fft_data(cin, n1[test]);
            fftw(plan, 1, cin, 1, 0, out, 1, 0);
        }
        end_t = fftw_get_time();
        time1 = fftw_time_to_sec(fftw_time_diff(fftw_time_diff(end_t, start_t),
                                                init_t)) * time_scale;
        printf("%20g", time1);
        fflush(stdout);

        /* Time threads FFTW (same protocol): */

        initialize_fft_data(cin, n1[test]);
        fftw_threads(nthreads, plan, 1, cin, 1, 0, out, 1, 0);
        start_t = fftw_get_time();
        for (iter = 0; iter < max_iter; ++iter) {
            initialize_fft_data(cin, n1[test]);
            fftw_threads(nthreads, plan, 1, cin, 1, 0, out, 1, 0);
        }
        end_t = fftw_get_time();
        time2 = fftw_time_to_sec(fftw_time_diff(fftw_time_diff(end_t, start_t),
                                                init_t)) * time_scale;
        printf("%20g", time2);
        printf("%20g\n", time1 / time2);
        fflush(stdout);

        fftw_destroy_plan(plan);
    }

    fftw_free(cin);
    fftw_free(out);

    /*************** Benchmark fftwnd_threads ****************/
    printf("\n");

    /* Size the single in-place buffer for the largest 3D problem. */
    max_size = 0;
    for (i = 0; i < N_TESTS_3D; ++i)
        if (n3[i][0] * n3[i][1] * n3[i][2] > max_size)
            max_size = n3[i][0] * n3[i][1] * n3[i][2];

    cin = fftw_malloc(max_size * sizeof(FFTW_COMPLEX));

    if (!cin) {
        printf("Not enough memory!  At least %lu bytes needed.\n",
               (unsigned long) (max_size * sizeof(FFTW_COMPLEX)));
        exit(1);
    }
    printf("%15s%20s%20s%20s\n", "Array Size", "FFTWND", "FFTWND_THREADS",
           "Speedup Factor");

    for (test = 0; test < N_TESTS_3D; ++test) {
        int N;

        plan_nd = fftwnd_create_plan(3, n3[test], FFTW_FORWARD,
                                     FFTW_IN_PLACE | FFTW_MEASURE);

        N = n3[test][0] * n3[test][1] * n3[test][2];

        max_iter = NUM_ITER / (N * log(2.0 * N));

        if (max_iter < 1)
            max_iter = 1;

        time_scale = 1.0e6 / (max_iter * (log(N) / log(2.0) * N));

        /* Initialization-only timing, subtracted from the results below. */
        initialize_fft_data(cin, N);
        start_t = fftw_get_time();
        for (iter = 0; iter < max_iter; ++iter)
            initialize_fft_data(cin, N);
        end_t = fftw_get_time();
        init_t = fftw_time_diff(end_t, start_t);

        {
            /* Large enough for any three int values plus separators. */
            char s[64];
            sprintf(s, "%dx%dx%d", n3[test][0], n3[test][1], n3[test][2]);
            printf("%15s", s);
        }
        fflush(stdout);

        /* Time FFTW: */

        /* The plan is in-place and the 1D output buffer has already been
           freed, so no output array is passed (as in the threaded call). */
        initialize_fft_data(cin, N);
        fftwnd(plan_nd, 1, cin, 1, 0, NULL, 1, 0);
        start_t = fftw_get_time();
        for (iter = 0; iter < max_iter; ++iter) {
            initialize_fft_data(cin, N);
            fftwnd(plan_nd, 1, cin, 1, 0, NULL, 1, 0);
        }
        end_t = fftw_get_time();
        time1 = fftw_time_to_sec(fftw_time_diff(fftw_time_diff(end_t, start_t),
                                                init_t)) * time_scale;
        printf("%20g", time1);
        fflush(stdout);

        /* Time Threads FFTW: */

        initialize_fft_data(cin, N);
        fftwnd_threads(nthreads, plan_nd, 1, cin, 1, 0, 0, 0, 0);
        start_t = fftw_get_time();
        for (iter = 0; iter < max_iter; ++iter) {
            initialize_fft_data(cin, N);
            fftwnd_threads(nthreads, plan_nd, 1, cin, 1, 0, 0, 0, 0);
        }
        end_t = fftw_get_time();
        time2 = fftw_time_to_sec(fftw_time_diff(fftw_time_diff(end_t, start_t),
                                                init_t)) * time_scale;
        printf("%20g", time2);

        /* Done. */

        printf("%20g\n", time1 / time2);
        fflush(stdout);

        fftwnd_destroy_plan(plan_nd);
    }

    fftw_free(cin);

    return 0;
}

/*
 * Overwrite the first n elements of arr with one fixed complex value
 * (real part 0.56923456, imaginary part 0.23858572).  The values are
 * arbitrary.  Does nothing when n <= 0.
 */
void initialize_fft_data(FFTW_COMPLEX * arr, long n)
{
    long k = 0;

    while (k < n) {
	c_re(arr[k]) = 0.56923456;
	c_im(arr[k]) = 0.23858572;
	++k;
    }
}
