Fast FFT-based image convolution support for large kernels

2013-10-25 22:21:53 +02:00 · 2013-10-25 22:21:53 +02:00 · 257526f6b2
parent dc4ea75259
commit 257526f6b2
10 changed files with 216 additions and 3 deletions
--- a/build/SConscript.configure
+++ b/build/SConscript.configure
@ -71,6 +71,9 @@ vars.Add('JPEGLIBDIR',      'libjpeg library path')
 vars.Add('COLLADAINCLUDE',  'COLLADA DOM include path')
 vars.Add('COLLADALIB',      'COLLADA DOM libraries')
 vars.Add('COLLADALIBDIR',   'COLLADA DOM library path')
 vars.Add('FFTWINCLUDE',     'FFTW include path')
 vars.Add('FFTWLIB',         'FFTW libraries')
 vars.Add('FFTWLIBDIR',      'FFTW library path')
 vars.Add('SHLIBPREFIX',     'Prefix for shared libraries')
 vars.Add('SHLIBSUFFIX',     'Suffix for shared libraries')
 vars.Add('LIBPREFIX',       'Prefix for windows library files')
@ -163,6 +166,10 @@ if env.has_key('COLLADAINCLUDE'):
 	env.Prepend(CPPPATH=env['COLLADAINCLUDE'])
 if env.has_key('COLLADALIBDIR'):
 	env.Prepend(LIBPATH=env['COLLADALIBDIR'])
 if env.has_key('FFTWINCLUDE'):
 	env.Prepend(CPPPATH=env['FFTWINCLUDE'])
 if env.has_key('FFTWLIBDIR'):
 	env.Prepend(LIBPATH=env['FFTWLIBDIR'])
 if not conf.CheckCXX():
 	print 'Could not compile a simple C++ fragment, verify that ' + \
@ -216,6 +223,10 @@ if not conf.TryCompile("#include <boost/version.hpp>\n#if BOOST_VERSION < 104400
 if not conf.CheckCXXHeader('Eigen/Core'):
 	print 'Eigen 3.x is missing (install libeigen3-dev)!'
 	Exit(1)
 if not conf.CheckCXXHeader('fftw3.h'):
    print 'FFTW3 not found (install for fast image convolution support)'
 else:
 	env.Append(CPPDEFINES = [['MTS_HAS_FFTW', 1]] )
 if sys.platform == 'win32':
 	if not (conf.CheckCHeader(['windows.h', 'GL/gl.h']) \
 			and conf.CheckCHeader(['windows.h', 'GL/glu.h']) \
--- a/build/config-linux-gcc-debug.py
+++ b/build/config-linux-gcc-debug.py
@ -21,6 +21,7 @@ GLFLAGS        = ['-DGLEW_MX']
 BOOSTLIB       = ['boost_system', 'boost_filesystem', 'boost_thread']
 COLLADAINCLUDE = ['/usr/include/collada-dom', '/usr/include/collada-dom/1.4']
 COLLADALIB     = ['collada14dom', 'xml2']
 FFTWLIB        = ['fftw3']
 # The following assumes that the Mitsuba bindings should be built for the
 # "default" Python version. It is also possible to build bindings for multiple
--- a/build/config-linux-gcc.py
+++ b/build/config-linux-gcc.py
@ -21,6 +21,7 @@ GLFLAGS        = ['-DGLEW_MX']
 BOOSTLIB       = ['boost_system', 'boost_filesystem', 'boost_thread']
 COLLADAINCLUDE = ['/usr/include/collada-dom', '/usr/include/collada-dom/1.4']
 COLLADALIB     = ['collada14dom', 'xml2']
 FFTWLIB        = ['fftw3']
 # The following assumes that the Mitsuba bindings should be built for the
 # "default" Python version. It is also possible to build bindings for multiple
--- a/build/config-linux-icl.py
+++ b/build/config-linux-icl.py
@ -21,6 +21,7 @@ GLFLAGS        = ['-DGLEW_MX']
 BOOSTLIB       = ['boost_system', 'boost_filesystem', 'boost_thread']
 COLLADAINCLUDE = ['/usr/include/collada-dom', '/usr/include/collada-dom/1.4']
 COLLADALIB     = ['collada14dom', 'xml2']
 FFTWLIB        = ['fftw3']
 # The following assumes that the Mitsuba bindings should be built for the
 # "default" Python version. It is also possible to build bindings for multiple
--- a/include/mitsuba/core/bitmap.h
+++ b/include/mitsuba/core/bitmap.h
@ -784,6 +784,24 @@ public:
 		accumulate(bitmap, Point2i(0), targetOffset, bitmap->getSize());
 	}
 	/**
 	 * \brief Convolve the image with a (centered) convolution kernel
 	 *
 	 * When compiled with FFTW, Mitsuba will do the convolution
 	 * in frequency space using three FFT operations. Otherwise,
 	 * it falls back to a brute force method with quadratic
 	 * complexity.
 	 *
 	 * The image can have any resolution; the kernel should be
 	 * square and of odd resolution. Both images must be \ref EFloat32
 	 * or \ref EFloat64 valued and of the same pixel format. Each
 	 * channel is processed separately.
 	 *
 	 * The convolution is always performed in double precision.
 	 * irrespective of the precision of the underlying data.
 	 */
 	void convolve(const Bitmap *kernel);
 	/**
 	 * \brief Accumulate the contents of another bitmap into the
 	 * region of the specified offset
--- a/include/mitsuba/core/util.h
+++ b/include/mitsuba/core/util.h
@ -274,11 +274,26 @@ extern MTS_EXPORT_CORE Float hypot2(Float a, Float b);
 extern MTS_EXPORT_CORE Float log2(Float value);
 /// Always-positive modulo function (assumes b > 0)
-inline int modulo(int a, int b) {
+inline int32_t modulo(int32_t a, int32_t b) {
-	int r = a % b;
+	int32_t r = a % b;
 	return (r < 0) ? r+b : r;
 }
 /// Always-positive modulo function (assumes b > 0)
 inline int64_t modulo(int64_t a, int64_t b) {
 	int64_t r = a % b;
 	return (r < 0) ? r+b : r;
 }
 #if defined(MTS_AMBIGUOUS_SIZE_T)
 inline ssize_t modulo(ssize_t a, ssize_t b) {
 	if (sizeof(ssize_t) == 8)
 		return modulo((int64_t) a, (int64_t) b);
 	else
 		return modulo((int32_t) a, (int32_t) b);
 }
 #endif
 /// Always-positive modulo function, float version (assumes b > 0)
 inline Float modulo(Float a, Float b) {
 	Float r = std::fmod(a, b);
--- a/src/libcore/SConscript
+++ b/src/libcore/SConscript
@ -22,6 +22,12 @@ if coreEnv.has_key('JPEGINCLUDE'):
 	coreEnv.Prepend(CPPPATH=env['JPEGINCLUDE'])
 if coreEnv.has_key('JPEGLIB'):
 	coreEnv.Prepend(LIBS=env['JPEGLIB'])
 if coreEnv.has_key('FFTWLIBDIR'):
 	coreEnv.Prepend(LIBPATH=env['FFTWLIBDIR'])
 if coreEnv.has_key('FFTWINCLUDE'):
 	coreEnv.Prepend(CPPPATH=env['FFTWINCLUDE'])
 if coreEnv.has_key('FFTWLIB'):
 	coreEnv.Prepend(LIBS=env['FFTWLIB'])
 coreEnv.Prepend(CPPDEFINES = [['MTS_BUILD_MODULE', 'MTS_MODULE_CORE']])
--- a/src/libcore/bitmap.cpp
+++ b/src/libcore/bitmap.cpp
@ -59,6 +59,11 @@ extern "C" {
 };
 #endif
 #if defined(MTS_HAS_FFTW)
 #include <complex>
 #include <fftw3.h>
 #endif
 MTS_NAMESPACE_BEGIN
 #if defined(MTS_HAS_OPENEXR)
@ -558,6 +563,156 @@ void Bitmap::accumulate(const Bitmap *bitmap, Point2i sourceOffset,
 	}
 }
 void Bitmap::convolve(const Bitmap *_kernel) {
 	if (_kernel->getWidth() != _kernel->getHeight())
 		Log(EError, "Bitmap::convolve(): convolution kernel must be square!");
 	if (_kernel->getWidth() % 2 != 1)
 		Log(EError, "Bitmap::convolve(): convolution kernel size must be odd!");
 	if (_kernel->getChannelCount() != getChannelCount())
 		Log(EError, "Bitmap::convolve(): kernel and bitmap have different channel counts!");
 	if (_kernel->getPixelFormat() != getPixelFormat())
 		Log(EError, "Bitmap::convolve(): kernel and bitmap have different pixel formats!");
 	if (_kernel->getComponentFormat() != getComponentFormat())
 		Log(EError, "Bitmap::convolve(): kernel and bitmap have different component formats!");
 	if (m_componentFormat != EFloat32 && m_componentFormat != EFloat64)
 		Log(EError, "Bitmap::convolve(): unsupported component format! (must be float32/float64)");
 	size_t kernelSize   = (size_t) _kernel->getWidth(),
 		   hKernelSize  = kernelSize / 2,
 		   width        = (size_t) m_size.x,
 		   height       = (size_t) m_size.y;
 #if defined(MTS_HAS_FFTW)
 	typedef std::complex<double> complex;
 	size_t paddedWidth  = width + hKernelSize,
 		   paddedHeight = height + hKernelSize,
 		   paddedSize   = paddedWidth*paddedHeight;
 	complex *kernel  = (complex *) fftw_malloc(sizeof(complex) * paddedSize),
 	        *kernelS = (complex *) fftw_malloc(sizeof(complex) * paddedSize),
 	        *data    = (complex *) fftw_malloc(sizeof(complex) * paddedSize),
 	        *dataS   = (complex *) fftw_malloc(sizeof(complex) * paddedSize);
 	if (!kernel || !kernelS || !data || !dataS)
 		SLog(EError, "Bitmap::convolve(): Unable to allocate temporary memory!");
 	/* Create a FFTW plan for a 2D DFT of this size */
 	fftw_plan p = fftw_plan_dft_2d(paddedHeight, paddedWidth,
 		(fftw_complex *) kernel, (fftw_complex *) kernelS, FFTW_FORWARD, FFTW_ESTIMATE);
 	memset(kernel, 0, sizeof(complex)*paddedSize);
 	for (int ch=0; ch<m_channelCount; ++ch) {
 		memset(data, 0, sizeof(complex)*paddedSize);
 		if (m_componentFormat == EFloat32) {
 			/* Copy and zero-pad the convolution kernel in a wraparound fashion */
 			for (size_t y=0; y<kernelSize; ++y) {
 				ssize_t wrappedY = modulo(hKernelSize - (ssize_t) y, (ssize_t) paddedHeight);
 				for (size_t x=0; x<kernelSize; ++x) {
 					ssize_t wrappedX = modulo(hKernelSize - (ssize_t) x, (ssize_t) paddedWidth);
 					kernel[wrappedX+wrappedY*paddedWidth] = _kernel->getFloat32Data()[(x+y*kernelSize)*m_channelCount+ch];
 				}
 			}
 			/* Copy and zero-pad the input data */
 			for (size_t y=0; y<height; ++y)
 				for (size_t x=0; x<width; ++x)
 					data[x+y*paddedWidth] = getFloat32Data()[(x+y*width)*m_channelCount + ch];
 		} else {
 			/* Copy and zero-pad the convolution kernel in a wraparound fashion */
 			for (size_t y=0; y<kernelSize; ++y) {
 				ssize_t wrappedY = modulo(hKernelSize - (ssize_t) y, (ssize_t) paddedHeight);
 				for (size_t x=0; x<kernelSize; ++x) {
 					ssize_t wrappedX = modulo(hKernelSize - (ssize_t) x, (ssize_t) paddedWidth);
 					kernel[wrappedX+wrappedY*paddedWidth] = _kernel->getFloat64Data()[(x+y*kernelSize)*m_channelCount+ch];
 				}
 			}
 			/* Copy and zero-pad the input data */
 			for (size_t y=0; y<height; ++y)
 				for (size_t x=0; x<width; ++x)
 					data[x+y*paddedWidth] = getFloat64Data()[(x+y*width)*m_channelCount + ch];
 		}
 		/* FFT the kernel and data */
 		fftw_execute(p);
 		fftw_execute_dft(p, (fftw_complex *) data, (fftw_complex *) dataS);
 		/* Multiply in frequency space -- also conjugate and scale to use the computed FFT plan backwards */
 		double factor = (double) 1 / (paddedWidth*paddedHeight);
 		for (size_t i=0; i<paddedSize; ++i)
 			dataS[i] = factor * std::conj(dataS[i] * kernelS[i]);
 		/* "IFFT" */
 		fftw_execute_dft(p, (fftw_complex *) dataS, (fftw_complex *) data);
 		if (m_componentFormat == EFloat32) {
 			for (size_t y=0; y<height; ++y)
 				for (size_t x=0; x<width; ++x)
 					getFloat32Data()[(x+y*width)*m_channelCount+ch] = (float) std::real(data[x+y*paddedWidth]);
 		} else {
 			for (size_t y=0; y<height; ++y)
 				for (size_t x=0; x<width; ++x)
 					getFloat64Data()[(x+y*width)*m_channelCount+ch] = std::real(data[x+y*paddedWidth]);
 		}
 	}
 	fftw_destroy_plan(p);
 	fftw_free(kernel);
 	fftw_free(kernelS);
 	fftw_free(data);
 	fftw_free(dataS);
 #else
 	/* Brute force fallback version */
 	float *output = static_cast<float *>(allocAligned(getBufferSize()));
 	for (int ch=0; ch<m_channelCount; ++ch) {
 		if (m_componentFormat == EFloat32) {
 			const float *input = getFloat32Data();
 			const float *kernel = _kernel->getFloat32Data();
 			for (size_t y=0; y<height; ++y) {
 				for (size_t x=0; x<width; ++x) {
 					double result = 0;
 					for (size_t ky=0; ky<kernelSize; ++ky) {
 						for (size_t kx=0; kx<kernelSize; ++kx) {
 							ssize_t xs = x + kx - (ssize_t) hKernelSize,
 									ys = y + ky - (ssize_t) hKernelSize;
 							if (xs >= 0 && ys >= 0 && xs < (ssize_t) width && ys < (ssize_t) height)
 								result += kernel[(kx+ky*kernelSize)*m_channelCount+ch]
 									* input[(xs+ys*width)*m_channelCount+ch];
 						}
 					}
 					output[(x+y*width)*m_channelCount+ch] = (float) result;
 				}
 			}
 		} else {
 			const double *input = getFloat64Data();
 			const double *kernel = _kernel->getFloat64Data();
 			for (size_t y=0; y<height; ++y) {
 				for (size_t x=0; x<width; ++x) {
 					double result = 0;
 					for (size_t ky=0; ky<kernelSize; ++ky) {
 						for (size_t kx=0; kx<kernelSize; ++kx) {
 							ssize_t xs = x + kx - (ssize_t) hKernelSize,
 									ys = y + ky - (ssize_t) hKernelSize;
 							if (xs >= 0 && ys >= 0 && xs < (ssize_t) width && ys < (ssize_t) height)
 								result += kernel[(kx+ky*kernelSize)*m_channelCount+ch]
 									* input[(xs+ys*width)*m_channelCount+ch];
 						}
 					}
 					output[(x+y*width)*m_channelCount+ch] = result;
 				}
 			}
 		}
 	}
 	if (m_ownsData)
 		freeAligned(m_data);
 	m_data = (uint8_t *) output;
 	m_ownsData = true;
 #endif
 }
 void Bitmap::scale(Float value) {
 	if (m_componentFormat == EBitmask)
 		Log(EError, "Bitmap::scale(): bitmasks are not supported!");
@ -801,7 +956,6 @@ ref<Bitmap> Bitmap::arithmeticOperation(Bitmap::EArithmeticOperation operation,
 	return output;
 }
 void Bitmap::colorBalance(Float r, Float g, Float b) {
 	if (m_pixelFormat != ERGB && m_pixelFormat != ERGBA)
 		Log(EError, "colorBalance(): expected a RGB or RGBA image!");
--- a/src/libpython/core.cpp
+++ b/src/libpython/core.cpp
@ -756,8 +756,10 @@ void export_core() {
 		.def("crop", &Bitmap::crop)
 		.def("applyMatrix", &bitmap_applyMatrix)
 		.def("colorBalance", &Bitmap::colorBalance)
 		.def("convolve", &Bitmap::convolve)
 		.def("accumulate", accumulate_1)
 		.def("accumulate", accumulate_2)
 		.def("scale", &Bitmap::scale)
 		.def("write", &bitmap_write)
 		.def("setMetadataString", &Bitmap::setMetadataString)
 		.def("getMetadataString", &Bitmap::getMetadataString, BP_RETURN_VALUE)
--- a/src/mtsgui/aboutdlg.cpp
+++ b/src/mtsgui/aboutdlg.cpp
@ -78,6 +78,10 @@ AboutDialog::AboutDialog(QWidget *parent) :
 	configFlags += "MTS_HAS_BREAKPAD ";
 #endif
 #if defined(MTS_HAS_FFTW)
 	configFlags += "MTS_HAS_FFTW ";
 #endif
 	configFlags += formatString("SPECTRUM_SAMPLES=%i ",
 		SPECTRUM_SAMPLES).c_str();