Skip to content

Instantly share code, notes, and snippets.

@ezietsman
Last active August 26, 2020 20:42
Show Gist options
  • Save ezietsman/226473 to your computer and use it in GitHub Desktop.
Save ezietsman/226473 to your computer and use it in GitHub Desktop.
Python, Cython, Fortran f2py and OpenCL versions of a Deeming periodogram
#!/bin/bash
f2py -c -m deeming periodogram.f90 -lgomp
f2py -c -m deemingomp periodogram.f90 --f90flags="-fopenmp " -lgomp
#
# Cython version of the Deeming periodogram function.
# See deeming.py
#
import numpy as np
cimport numpy as np
cimport cython
cdef extern from 'math.h':
double sin(double theta)
double cos(double theta)
DTYPE = np.float
ctypedef np.float_t DTYPE_t
@cython.cdivision(True)
@cython.boundscheck(False)
def periodogram(np.ndarray[DTYPE_t, ndim=1] t not None, \
np.ndarray[DTYPE_t, ndim=1] m not None,\
np.ndarray[DTYPE_t, ndim=1] freqs not None):
cdef DTYPE_t realpart = 0.0
cdef DTYPE_t imagpart = 0.0
cdef DTYPE_t pi = np.pi
cdef int Nf = freqs.shape[0]
cdef int Nt = t.shape[0]
cdef np.ndarray[DTYPE_t, ndim=1] amps = np.zeros([Nf], dtype=DTYPE)
cdef int i, j
for i in range(Nf):
for j in range(Nt):
realpart = realpart + m[j]*cos(2.0*pi*freqs[i]*t[j])
imagpart = imagpart + m[j]*sin(2.0*pi*freqs[i]*t[j])
amps[i] = 2.0*(realpart**2 + imagpart**2)**0.5/Nt
realpart = 0.0
imagpart = 0.0
return amps
#!/usr/bin/env python
'''
Testing the efficiency of different implimentations of the Deeming
periodogram. Its an O(N*N) algorithm for calculating a periodogram but
it works for unevenly sampled data. Used frequently in astronomical
time-series.
cyperiodogram contains a cython implimentation
periodogram.f90 contains two Fortran 90 implimentations
1 : Using the same array syntax as numpy
2 : Hand coded loops (Much faster in this case)
The fortran subroutines use openmp and are wrapped using f2py. Look in
build_deeming.sh for the compilation commands.
The OpenCL version uses deeming_kernel.cl for computations on the opencl device
(A GPU in my case)
To run this example (in Linux):
sh build_deeming.sh
python deeming.py
The cython extension will be compiled when running the deeming.py file.
'''
import numpy as np
import deemingomp
import time
import pyximport; pyximport.install()
import cyperiodogram as cy
import pyopencl as cl
import matplotlib.pyplot as plt
import matplotlib.style
matplotlib.style.use('ggplot')
def timeit(method):
'''
From: http://www.samuelbosch.com/2012/02/timing-functions-in-python.html
'''
def timed(*args, **kw):
ts = time.time()
result = method(*args, **kw)
te = time.time()
# print('%r %2.2f sec' % (method.__name__, te-ts))
return result, (te-ts)
return timed
# Deeming periodogram done with numpy
@timeit
def periodogram_numpy(t, m, freqs):
''' Calculate the Deeming periodogram using numpy
'''
pi = np.pi
amps = np.zeros(freqs.size, dtype='float')
cos = np.cos
sin = np.sin
for i, f in enumerate(freqs):
real = (m*cos(2*pi*f*t)).sum()
imag = (m*sin(2*pi*f*t)).sum()
amps[i] = real**2 + imag**2
amps = 2.0*np.sqrt(amps)/t.size
return amps
@timeit
def periodogram_cython(t, m, f):
''' Calculate Deeming periodogram using cython: see cyperiodogram.pyx
'''
cyamps = cy.periodogram(t, m, f)
return cyamps
@timeit
def periodogram_fortran_openmp(t, m, f, threads):
''' Calculate the Deeming periodogram using Fortran with OpenMP
'''
ampsf90omp_2 = deemingomp.periodogram2(t, m, f, t.size, f.size, threads)
return ampsf90omp_2
@timeit
def periodogram_opencl(t, m, f):
''' Calculate the Deeming periodogram using OpenCL on the GPU
'''
# create a context and a job queue
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
# create buffers to send to device
mf = cl.mem_flags
# input buffers
times_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=t)
mags_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=m)
freqs_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=f)
# output buffers
amps_buffer = cl.Buffer(ctx, mf.WRITE_ONLY, f.nbytes)
amps_g = np.empty_like(f)
# read and compile the opencl kernel
with open('deeming_kernel.cl') as kernel:
prg = cl.Program(ctx, kernel.read()).build()
try:
prg.build()
except:
print("Error:")
print(prg.get_build_info(ctx.devices[0],
cl.program_build_info.LOG))
raise
# call the function and copy the values from the buffer to a numpy array
prg.periodogram(queue, amps_g.shape, None,
times_g,
mags_g,
freqs_g,
amps_buffer,
np.int32(t.size))
cl.enqueue_copy(queue, amps_g, amps_buffer)
return amps_g
@timeit
def run_benchmark(N, M):
''' Run the Deeming benchmark using all the implementations using N data
points and M frequencies
'''
pi = np.pi
# Data
t = np.linspace(0.0, 0.25, N)
m = 1.75*np.sin(2*pi*150*t) + 0.75*np.sin(2*pi*277*t) + np.sin(2*pi*333*t)
# Frequencies
freqs = np.linspace(0.0, 1000.0, M)
# Run the different versions
_times = {}
# numpy
amps, t_numpy = periodogram_numpy(t, m, freqs)
# cython
cyamps, t_cython = periodogram_cython(t, m, freqs)
assert(np.allclose(cyamps, amps))
# Fortran with openmp
for threads in [1, 2, 4, 8]:
ampsf90omp_2, t_fortran = periodogram_fortran_openmp(t,
m,
freqs,
threads)
_times['fortran {}'.format(threads)] = t_fortran
assert(np.allclose(ampsf90omp_2, amps))
# opencl
opencl_amps, t_opencl = periodogram_opencl(t, m, freqs)
assert(np.allclose(opencl_amps, amps))
_times['numpy'] = t_numpy
_times['cython'] = t_cython
_times['opencl'] = t_opencl
return _times
def make_bar_plot(N, times_dict):
''' Make a bar plot using the timing dict obtained from run_benchmarks
'''
speedups = []
times = []
methods = ['numpy', 'cython', 'fortran 1', 'fortran 2', 'fortran 4',
'fortran 8', 'opencl']
for i, key in enumerate(methods):
speedups.append(times_dict['numpy']/times_dict[key])
times.append(times_dict[key])
index = np.arange(len(methods))
fig, ax = plt.subplots()
rect1 = ax.bar(index, speedups)
plt.xlabel('Method', fontsize=18)
plt.ylabel('Speedup (higher is better)', fontsize=18)
plt.xticks(index + 0.4, methods, fontsize=14)
def autolabel(rects, times):
# attach some text labels
for rect, _time in zip(rects, times):
height = rect.get_height()
ax.text(rect.get_x()+rect.get_width()/2., 1.02*height, '%1.1fs'% _time,
ha='center', va='bottom')
autolabel(rect1, times)
# save to file
plt.savefig('{}x{}-barchart.jpg'.format(N, N))
if __name__ == "__main__":
for N in [1000, 2000, 4000, 8000, 16000, 32000, 64000]:
_times, _time_all = run_benchmark(N, N)
make_bar_plot(N, _times)
print("\n{} datapoints, {} frequencies in {}s".format(N, N, round(_time_all, 1)))
for key in sorted(_times):
print("{}: {} {} {}".format(key,
" "*(20-len(key)),
round(_times[key], 3),
round(_times['numpy']/_times[key], 1)))
// Kernel to compute the deeming periodogram for a given frequency over a set
// of data
#define PYOPENCL_DEFINE_CDOUBLE
#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void periodogram(
__global const double *times_g,
__global const double *mags_g,
__global const double *freqs_g,
__global double *amps_g,
const int datalength) {
int gid = get_global_id(0);
double this_frequency = freqs_g[gid];
double realpart = 0.0;
double imagpart = 0.0;
double pi = 3.141592653589793;
for (int i=0; i < datalength; i++){
realpart = realpart + mags_g[i]*cos(2.0*pi*this_frequency*times_g[i]);
imagpart = imagpart + mags_g[i]*sin(2.0*pi*this_frequency*times_g[i]);
}
amps_g[gid] = 2.0*sqrt(pow(realpart, 2) + pow(imagpart, 2))/datalength;
}
!
! See deeming.py for info. This file contains 2 Fortran implimentations
! of the deeming periodogram algorithm
subroutine periodogram(time, value, freqs, nt, nf , numprocs, amps)
! periodogram calculates the Deeming periodogram of the datapoints contained
! in the arrays time, value at the frequencies contained in freqs
! This subroutine is meant for f2py wrapping as well as a testbed of openMP
! within a python environment
use omp_lib
implicit none
! sizes of time and frequency arrays
integer :: nt, nf, id
! arrays
double precision time(nt), value(nt)
double precision freqs(nf), amps(nf)
integer :: numprocs, f
double precision :: pi, realpart, imagpart
pi = 3.1415926535897931D0
realpart = 0.0D0
imagpart = 0.0D0
! Add compiler directives so f2py knows what to do
!f2py intent(in) time
!f2py intent(in) value
!f2py intent(in) nt
!f2py depend(nt) time
!f2py depend(nt) value
!f2py intent(in) freqs
!f2py intent(in) nf
!f2py depend(nf) freqs
!f2py intent(in) numprocs
!f2py intent(out) amps
call OMP_SET_NUM_THREADS(numprocs)
!$OMP PARALLEL DO &
!$OMP DEFAULT(SHARED) PRIVATE(f,realpart, imagpart)
do f = 1, nf
! calculate real and imaginary parts
realpart = sum((value*dcos(2.0D0*pi*freqs(f)*time)))
imagpart = sum((value*dsin(2.0D0*pi*freqs(f)*time)))
! calculate the amplitude
amps(f) = 2.0D0*dsqrt(realpart**2 + imagpart**2)/nt
end do
!$OMP END PARALLEL DO
end subroutine
subroutine periodogram2(time, value, freqs, nt, nf , numprocs, amps)
! periodogram calculates the Deeming periodogram of the datapoints contained
! in the arrays time, value at the frequencies contained in freqs
! This subroutine is meant for f2py wrapping as well as a testbed of openMP
! within a python environment
use omp_lib
implicit none
! sizes of time and frequency arrays
integer :: nt, nf, id
! arrays
double precision time(nt), value(nt)
double precision freqs(nf), amps(nf)
integer :: numprocs, f, d
double precision :: pi, realpart, imagpart
pi = 3.1415926535897931D0
realpart = 0.0D0
imagpart = 0.0D0
! Add compiler directives so f2py knows what to do
!f2py threadsafe
!f2py intent(in) time
!f2py intent(in) value
!f2py intent(in) nt
!f2py depend(nt) time
!f2py depend(nt) value
!f2py intent(in) freqs
!f2py intent(in) nf
!f2py depend(nf) freqs
!f2py intent(in) numprocs
!f2py intent(out) amps
call OMP_SET_NUM_THREADS(numprocs)
!$OMP PARALLEL DO &
!$OMP DEFAULT(SHARED) PRIVATE(f,realpart, imagpart)
do f = 1, nf
! calculate real and imaginary parts
do d = 1, nt
realpart = realpart + value(d)*dcos(2.0D0*pi*freqs(f)*time(d))
imagpart = imagpart + value(d)*dsin(2.0D0*pi*freqs(f)*time(d))
end do
! calculate the amplitude
amps(f) = 2.0D0*dsqrt(realpart*realpart + imagpart*imagpart)/nt
realpart = 0.0D0
imagpart = 0.0D0
end do
!$OMP END PARALLEL DO
end subroutine
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment