From 257526f6b27c7ac13294efc8d8b62c43c52d224c Mon Sep 17 00:00:00 2001
From: Wenzel Jakob <wenzel@cs.cornell.edu>
Date: Fri, 25 Oct 2013 22:21:53 +0200
Subject: [PATCH] Fast FFT-based image convolution support for large kernels

---
 build/SConscript.configure      |  11 +++
 build/config-linux-gcc-debug.py |   1 +
 build/config-linux-gcc.py       |   1 +
 build/config-linux-icl.py       |   1 +
 include/mitsuba/core/bitmap.h   |  18 ++++
 include/mitsuba/core/util.h     |  19 +++-
 src/libcore/SConscript          |   6 ++
 src/libcore/bitmap.cpp          | 156 +++++++++++++++++++++++++++++++-
 src/libpython/core.cpp          |   2 +
 src/mtsgui/aboutdlg.cpp         |   4 +
 10 files changed, 216 insertions(+), 3 deletions(-)

diff --git a/build/SConscript.configure b/build/SConscript.configure
index 9d4c1517..17e0dd1b 100644
--- a/build/SConscript.configure
+++ b/build/SConscript.configure
@@ -71,6 +71,9 @@ vars.Add('JPEGLIBDIR',      'libjpeg library path')
 vars.Add('COLLADAINCLUDE',  'COLLADA DOM include path')
 vars.Add('COLLADALIB',      'COLLADA DOM libraries')
 vars.Add('COLLADALIBDIR',   'COLLADA DOM library path')
+vars.Add('FFTWINCLUDE',     'FFTW include path')
+vars.Add('FFTWLIB',         'FFTW libraries')
+vars.Add('FFTWLIBDIR',      'FFTW library path')
 vars.Add('SHLIBPREFIX',     'Prefix for shared libraries')
 vars.Add('SHLIBSUFFIX',     'Suffix for shared libraries')
 vars.Add('LIBPREFIX',       'Prefix for windows library files')
@@ -163,6 +166,10 @@ if env.has_key('COLLADAINCLUDE'):
 	env.Prepend(CPPPATH=env['COLLADAINCLUDE'])
 if env.has_key('COLLADALIBDIR'):
 	env.Prepend(LIBPATH=env['COLLADALIBDIR'])
+if env.has_key('FFTWINCLUDE'):
+	env.Prepend(CPPPATH=env['FFTWINCLUDE'])
+if env.has_key('FFTWLIBDIR'):
+	env.Prepend(LIBPATH=env['FFTWLIBDIR'])
 
 if not conf.CheckCXX():
 	print 'Could not compile a simple C++ fragment, verify that ' + \
@@ -216,6 +223,10 @@ if not conf.TryCompile("#include <boost/version.hpp>\n#if BOOST_VERSION < 104400
 if not conf.CheckCXXHeader('Eigen/Core'):
 	print 'Eigen 3.x is missing (install libeigen3-dev)!'
 	Exit(1)
+if not conf.CheckCXXHeader('fftw3.h'):
+    print 'FFTW3 not found (install for fast image convolution support)'
+else:
+	env.Append(CPPDEFINES = [['MTS_HAS_FFTW', 1]] )
 if sys.platform == 'win32':
 	if not (conf.CheckCHeader(['windows.h', 'GL/gl.h']) \
 			and conf.CheckCHeader(['windows.h', 'GL/glu.h']) \
diff --git a/build/config-linux-gcc-debug.py b/build/config-linux-gcc-debug.py
index cd6ce15d..f9bc71ac 100644
--- a/build/config-linux-gcc-debug.py
+++ b/build/config-linux-gcc-debug.py
@@ -21,6 +21,7 @@ GLFLAGS        = ['-DGLEW_MX']
 BOOSTLIB       = ['boost_system', 'boost_filesystem', 'boost_thread']
 COLLADAINCLUDE = ['/usr/include/collada-dom', '/usr/include/collada-dom/1.4']
 COLLADALIB     = ['collada14dom', 'xml2']
+FFTWLIB        = ['fftw3']
 
 # The following assumes that the Mitsuba bindings should be built for the
 # "default" Python version. It is also possible to build bindings for multiple
diff --git a/build/config-linux-gcc.py b/build/config-linux-gcc.py
index 0567f27c..5630597c 100644
--- a/build/config-linux-gcc.py
+++ b/build/config-linux-gcc.py
@@ -21,6 +21,7 @@ GLFLAGS        = ['-DGLEW_MX']
 BOOSTLIB       = ['boost_system', 'boost_filesystem', 'boost_thread']
 COLLADAINCLUDE = ['/usr/include/collada-dom', '/usr/include/collada-dom/1.4']
 COLLADALIB     = ['collada14dom', 'xml2']
+FFTWLIB        = ['fftw3']
 
 # The following assumes that the Mitsuba bindings should be built for the
 # "default" Python version. It is also possible to build bindings for multiple
diff --git a/build/config-linux-icl.py b/build/config-linux-icl.py
index f1b12fc2..b93b7377 100644
--- a/build/config-linux-icl.py
+++ b/build/config-linux-icl.py
@@ -21,6 +21,7 @@ GLFLAGS        = ['-DGLEW_MX']
 BOOSTLIB       = ['boost_system', 'boost_filesystem', 'boost_thread']
 COLLADAINCLUDE = ['/usr/include/collada-dom', '/usr/include/collada-dom/1.4']
 COLLADALIB     = ['collada14dom', 'xml2']
+FFTWLIB        = ['fftw3']
 
 # The following assumes that the Mitsuba bindings should be built for the
 # "default" Python version. It is also possible to build bindings for multiple
diff --git a/include/mitsuba/core/bitmap.h b/include/mitsuba/core/bitmap.h
index 6a9fc20e..81ce9135 100644
--- a/include/mitsuba/core/bitmap.h
+++ b/include/mitsuba/core/bitmap.h
@@ -784,6 +784,24 @@ public:
 		accumulate(bitmap, Point2i(0), targetOffset, bitmap->getSize());
 	}
 
+	/**
+	 * \brief Convolve the image with a (centered) convolution kernel
+	 *
+	 * When compiled with FFTW, Mitsuba will do the convolution
+	 * in frequency space using three FFT operations. Otherwise,
+	 * it falls back to a brute force method with quadratic
+	 * complexity.
+	 *
+	 * The image can have any resolution; the kernel should be
+	 * square and of odd resolution. Both images must be \ref EFloat32
+	 * or \ref EFloat64 valued and of the same pixel format. Each
+	 * channel is processed separately.
+	 *
+	 * The convolution is always performed in double precision.
+	 * irrespective of the precision of the underlying data.
+	 */
+	void convolve(const Bitmap *kernel);
+
 	/**
 	 * \brief Accumulate the contents of another bitmap into the
 	 * region of the specified offset
diff --git a/include/mitsuba/core/util.h b/include/mitsuba/core/util.h
index 147eb51e..c1222b6b 100644
--- a/include/mitsuba/core/util.h
+++ b/include/mitsuba/core/util.h
@@ -274,11 +274,26 @@ extern MTS_EXPORT_CORE Float hypot2(Float a, Float b);
 extern MTS_EXPORT_CORE Float log2(Float value);
 
 /// Always-positive modulo function (assumes b > 0)
-inline int modulo(int a, int b) {
-	int r = a % b;
+inline int32_t modulo(int32_t a, int32_t b) {
+	int32_t r = a % b;
 	return (r < 0) ? r+b : r;
 }
 
+/// Always-positive modulo function (assumes b > 0)
+inline int64_t modulo(int64_t a, int64_t b) {
+	int64_t r = a % b;
+	return (r < 0) ? r+b : r;
+}
+
+#if defined(MTS_AMBIGUOUS_SIZE_T)
+inline ssize_t modulo(ssize_t a, ssize_t b) {
+	if (sizeof(ssize_t) == 8)
+		return modulo((int64_t) a, (int64_t) b);
+	else
+		return modulo((int32_t) a, (int32_t) b);
+}
+#endif
+
 /// Always-positive modulo function, float version (assumes b > 0)
 inline Float modulo(Float a, Float b) {
 	Float r = std::fmod(a, b);
diff --git a/src/libcore/SConscript b/src/libcore/SConscript
index 57e69154..7559d368 100644
--- a/src/libcore/SConscript
+++ b/src/libcore/SConscript
@@ -22,6 +22,12 @@ if coreEnv.has_key('JPEGINCLUDE'):
 	coreEnv.Prepend(CPPPATH=env['JPEGINCLUDE'])
 if coreEnv.has_key('JPEGLIB'):
 	coreEnv.Prepend(LIBS=env['JPEGLIB'])
+if coreEnv.has_key('FFTWLIBDIR'):
+	coreEnv.Prepend(LIBPATH=env['FFTWLIBDIR'])
+if coreEnv.has_key('FFTWINCLUDE'):
+	coreEnv.Prepend(CPPPATH=env['FFTWINCLUDE'])
+if coreEnv.has_key('FFTWLIB'):
+	coreEnv.Prepend(LIBS=env['FFTWLIB'])
 
 coreEnv.Prepend(CPPDEFINES = [['MTS_BUILD_MODULE', 'MTS_MODULE_CORE']])
 
diff --git a/src/libcore/bitmap.cpp b/src/libcore/bitmap.cpp
index 6e5606d1..9849d15d 100644
--- a/src/libcore/bitmap.cpp
+++ b/src/libcore/bitmap.cpp
@@ -59,6 +59,11 @@ extern "C" {
 };
 #endif
 
+#if defined(MTS_HAS_FFTW)
+#include <complex>
+#include <fftw3.h>
+#endif
+
 MTS_NAMESPACE_BEGIN
 
 #if defined(MTS_HAS_OPENEXR)
@@ -558,6 +563,156 @@ void Bitmap::accumulate(const Bitmap *bitmap, Point2i sourceOffset,
 	}
 }
 
+void Bitmap::convolve(const Bitmap *_kernel) {
+	if (_kernel->getWidth() != _kernel->getHeight())
+		Log(EError, "Bitmap::convolve(): convolution kernel must be square!");
+	if (_kernel->getWidth() % 2 != 1)
+		Log(EError, "Bitmap::convolve(): convolution kernel size must be odd!");
+	if (_kernel->getChannelCount() != getChannelCount())
+		Log(EError, "Bitmap::convolve(): kernel and bitmap have different channel counts!");
+	if (_kernel->getPixelFormat() != getPixelFormat())
+		Log(EError, "Bitmap::convolve(): kernel and bitmap have different pixel formats!");
+	if (_kernel->getComponentFormat() != getComponentFormat())
+		Log(EError, "Bitmap::convolve(): kernel and bitmap have different component formats!");
+	if (m_componentFormat != EFloat32 && m_componentFormat != EFloat64)
+		Log(EError, "Bitmap::convolve(): unsupported component format! (must be float32/float64)");
+
+	size_t kernelSize   = (size_t) _kernel->getWidth(),
+		   hKernelSize  = kernelSize / 2,
+		   width        = (size_t) m_size.x,
+		   height       = (size_t) m_size.y;
+
+#if defined(MTS_HAS_FFTW)
+	typedef std::complex<double> complex;
+
+	size_t paddedWidth  = width + hKernelSize,
+		   paddedHeight = height + hKernelSize,
+		   paddedSize   = paddedWidth*paddedHeight;
+
+	complex *kernel  = (complex *) fftw_malloc(sizeof(complex) * paddedSize),
+	        *kernelS = (complex *) fftw_malloc(sizeof(complex) * paddedSize),
+	        *data    = (complex *) fftw_malloc(sizeof(complex) * paddedSize),
+	        *dataS   = (complex *) fftw_malloc(sizeof(complex) * paddedSize);
+
+	if (!kernel || !kernelS || !data || !dataS)
+		SLog(EError, "Bitmap::convolve(): Unable to allocate temporary memory!");
+
+	/* Create a FFTW plan for a 2D DFT of this size */
+	fftw_plan p = fftw_plan_dft_2d(paddedHeight, paddedWidth,
+		(fftw_complex *) kernel, (fftw_complex *) kernelS, FFTW_FORWARD, FFTW_ESTIMATE);
+
+	memset(kernel, 0, sizeof(complex)*paddedSize);
+
+	for (int ch=0; ch<m_channelCount; ++ch) {
+		memset(data, 0, sizeof(complex)*paddedSize);
+		if (m_componentFormat == EFloat32) {
+			/* Copy and zero-pad the convolution kernel in a wraparound fashion */
+			for (size_t y=0; y<kernelSize; ++y) {
+				ssize_t wrappedY = modulo(hKernelSize - (ssize_t) y, (ssize_t) paddedHeight);
+				for (size_t x=0; x<kernelSize; ++x) {
+					ssize_t wrappedX = modulo(hKernelSize - (ssize_t) x, (ssize_t) paddedWidth);
+					kernel[wrappedX+wrappedY*paddedWidth] = _kernel->getFloat32Data()[(x+y*kernelSize)*m_channelCount+ch];
+				}
+			}
+
+			/* Copy and zero-pad the input data */
+			for (size_t y=0; y<height; ++y)
+				for (size_t x=0; x<width; ++x)
+					data[x+y*paddedWidth] = getFloat32Data()[(x+y*width)*m_channelCount + ch];
+		} else {
+			/* Copy and zero-pad the convolution kernel in a wraparound fashion */
+			for (size_t y=0; y<kernelSize; ++y) {
+				ssize_t wrappedY = modulo(hKernelSize - (ssize_t) y, (ssize_t) paddedHeight);
+				for (size_t x=0; x<kernelSize; ++x) {
+					ssize_t wrappedX = modulo(hKernelSize - (ssize_t) x, (ssize_t) paddedWidth);
+					kernel[wrappedX+wrappedY*paddedWidth] = _kernel->getFloat64Data()[(x+y*kernelSize)*m_channelCount+ch];
+				}
+			}
+
+			/* Copy and zero-pad the input data */
+			for (size_t y=0; y<height; ++y)
+				for (size_t x=0; x<width; ++x)
+					data[x+y*paddedWidth] = getFloat64Data()[(x+y*width)*m_channelCount + ch];
+		}
+
+		/* FFT the kernel and data */
+		fftw_execute(p);
+		fftw_execute_dft(p, (fftw_complex *) data, (fftw_complex *) dataS);
+
+		/* Multiply in frequency space -- also conjugate and scale to use the computed FFT plan backwards */
+		double factor = (double) 1 / (paddedWidth*paddedHeight);
+		for (size_t i=0; i<paddedSize; ++i)
+			dataS[i] = factor * std::conj(dataS[i] * kernelS[i]);
+
+		/* "IFFT" */
+		fftw_execute_dft(p, (fftw_complex *) dataS, (fftw_complex *) data);
+
+		if (m_componentFormat == EFloat32) {
+			for (size_t y=0; y<height; ++y)
+				for (size_t x=0; x<width; ++x)
+					getFloat32Data()[(x+y*width)*m_channelCount+ch] = (float) std::real(data[x+y*paddedWidth]);
+		} else {
+			for (size_t y=0; y<height; ++y)
+				for (size_t x=0; x<width; ++x)
+					getFloat64Data()[(x+y*width)*m_channelCount+ch] = std::real(data[x+y*paddedWidth]);
+		}
+	}
+	fftw_destroy_plan(p);
+	fftw_free(kernel);
+	fftw_free(kernelS);
+	fftw_free(data);
+	fftw_free(dataS);
+#else
+	/* Brute force fallback version */
+	float *output = static_cast<float *>(allocAligned(getBufferSize()));
+	for (int ch=0; ch<m_channelCount; ++ch) {
+		if (m_componentFormat == EFloat32) {
+			const float *input = getFloat32Data();
+			const float *kernel = _kernel->getFloat32Data();
+
+			for (size_t y=0; y<height; ++y) {
+				for (size_t x=0; x<width; ++x) {
+					double result = 0;
+					for (size_t ky=0; ky<kernelSize; ++ky) {
+						for (size_t kx=0; kx<kernelSize; ++kx) {
+							ssize_t xs = x + kx - (ssize_t) hKernelSize,
+									ys = y + ky - (ssize_t) hKernelSize;
+							if (xs >= 0 && ys >= 0 && xs < (ssize_t) width && ys < (ssize_t) height)
+								result += kernel[(kx+ky*kernelSize)*m_channelCount+ch]
+									* input[(xs+ys*width)*m_channelCount+ch];
+						}
+					}
+					output[(x+y*width)*m_channelCount+ch] = (float) result;
+				}
+			}
+		} else {
+			const double *input = getFloat64Data();
+			const double *kernel = _kernel->getFloat64Data();
+
+			for (size_t y=0; y<height; ++y) {
+				for (size_t x=0; x<width; ++x) {
+					double result = 0;
+					for (size_t ky=0; ky<kernelSize; ++ky) {
+						for (size_t kx=0; kx<kernelSize; ++kx) {
+							ssize_t xs = x + kx - (ssize_t) hKernelSize,
+									ys = y + ky - (ssize_t) hKernelSize;
+							if (xs >= 0 && ys >= 0 && xs < (ssize_t) width && ys < (ssize_t) height)
+								result += kernel[(kx+ky*kernelSize)*m_channelCount+ch]
+									* input[(xs+ys*width)*m_channelCount+ch];
+						}
+					}
+					output[(x+y*width)*m_channelCount+ch] = result;
+				}
+			}
+		}
+	}
+	if (m_ownsData)
+		freeAligned(m_data);
+	m_data = (uint8_t *) output;
+	m_ownsData = true;
+#endif
+}
+
 void Bitmap::scale(Float value) {
 	if (m_componentFormat == EBitmask)
 		Log(EError, "Bitmap::scale(): bitmasks are not supported!");
@@ -801,7 +956,6 @@ ref<Bitmap> Bitmap::arithmeticOperation(Bitmap::EArithmeticOperation operation,
 	return output;
 }
 
-
 void Bitmap::colorBalance(Float r, Float g, Float b) {
 	if (m_pixelFormat != ERGB && m_pixelFormat != ERGBA)
 		Log(EError, "colorBalance(): expected a RGB or RGBA image!");
diff --git a/src/libpython/core.cpp b/src/libpython/core.cpp
index 1d596b72..e75f5be8 100644
--- a/src/libpython/core.cpp
+++ b/src/libpython/core.cpp
@@ -756,8 +756,10 @@ void export_core() {
 		.def("crop", &Bitmap::crop)
 		.def("applyMatrix", &bitmap_applyMatrix)
 		.def("colorBalance", &Bitmap::colorBalance)
+		.def("convolve", &Bitmap::convolve)
 		.def("accumulate", accumulate_1)
 		.def("accumulate", accumulate_2)
+		.def("scale", &Bitmap::scale)
 		.def("write", &bitmap_write)
 		.def("setMetadataString", &Bitmap::setMetadataString)
 		.def("getMetadataString", &Bitmap::getMetadataString, BP_RETURN_VALUE)
diff --git a/src/mtsgui/aboutdlg.cpp b/src/mtsgui/aboutdlg.cpp
index aa541777..8ac72edc 100644
--- a/src/mtsgui/aboutdlg.cpp
+++ b/src/mtsgui/aboutdlg.cpp
@@ -78,6 +78,10 @@ AboutDialog::AboutDialog(QWidget *parent) :
 	configFlags += "MTS_HAS_BREAKPAD ";
 #endif
 
+#if defined(MTS_HAS_FFTW)
+	configFlags += "MTS_HAS_FFTW ";
+#endif
+
 	configFlags += formatString("SPECTRUM_SAMPLES=%i ",
 		SPECTRUM_SAMPLES).c_str();