From fbb4867f49835bca00b4d85491fbf650f5fdeb7e Mon Sep 17 00:00:00 2001
From: Wenzel Jakob <wenzel@inf.ethz.ch>
Date: Tue, 1 Jul 2014 22:35:30 +0200
Subject: [PATCH] Fixed core affinity handling, specified TLS dialect on Linux

- Fixed core affinity handling for Linux clusters with CPU quotas
  (e.g. the ETH Brutus cluster).

- Added an explicit -mtls-dialect flag to the Linux config.py files to
  prevent crashes when loading the Python plugin in certain settings
  (particularly: the ETH cluster).
---
 build/config-linux-gcc-debug.py |  2 +-
 build/config-linux-gcc.py       |  2 +-
 include/mitsuba/core/sched.h    | 15 ++++++++++
 src/libcore/sched.cpp           |  3 +-
 src/libcore/thread.cpp          | 53 +++++++++++++++++++++++++--------
 src/libcore/util.cpp            | 35 ++++++++++++++++++++--
 src/mitsuba/mitsuba.cpp         |  7 +++--
 src/mitsuba/mtsutil.cpp         |  7 +++--
 src/mtsgui/mainwindow.cpp       | 35 ++++++++++++++--------
 9 files changed, 124 insertions(+), 35 deletions(-)

diff --git a/build/config-linux-gcc-debug.py b/build/config-linux-gcc-debug.py
index ccd1d7e4..3ce1fbb0 100644
--- a/build/config-linux-gcc-debug.py
+++ b/build/config-linux-gcc-debug.py
@@ -4,7 +4,7 @@ BUILDDIR       = '#build/debug'
 DISTDIR        = '#dist'
 CXX            = 'g++'
 CC             = 'gcc'
-CXXFLAGS       = ['-O0', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fno-omit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden']
+CXXFLAGS       = ['-O0', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fno-omit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden', '-mtls-dialect=gnu2']
 LINKFLAGS      = []
 SHLINKFLAGS    = ['-rdynamic', '-shared', '-fPIC', '-lstdc++']
 BASEINCLUDE    = ['#include']
diff --git a/build/config-linux-gcc.py b/build/config-linux-gcc.py
index 5bc8b6ae..77c9b753 100644
--- a/build/config-linux-gcc.py
+++ b/build/config-linux-gcc.py
@@ -4,7 +4,7 @@ BUILDDIR       = '#build/release'
 DISTDIR        = '#dist'
 CXX            = 'g++'
 CC             = 'gcc'
-CXXFLAGS       = ['-O3', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fomit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden']
+CXXFLAGS       = ['-O3', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fomit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden', '-mtls-dialect=gnu2']
 LINKFLAGS      = []
 SHLINKFLAGS    = ['-rdynamic', '-shared', '-fPIC', '-lstdc++']
 BASEINCLUDE    = ['#include']
diff --git a/include/mitsuba/core/sched.h b/include/mitsuba/core/sched.h
index 8cc03bb0..d9daf0fc 100644
--- a/include/mitsuba/core/sched.h
+++ b/include/mitsuba/core/sched.h
@@ -758,6 +758,21 @@ protected:
  */
 class MTS_EXPORT_CORE LocalWorker : public Worker {
 public:
+	/**
+	 * \brief Create a new local worker thread
+	 *
+	 * \param coreID
+	 *   When a CPU core ID (>=0) is specified here, the worker
+	 *   thread will attempt to register core affinity with the
+	 *   operating system. Passing -1 disables this.
+	 *
+	 * \param name
+	 *   An identifying string for this thread
+	 *
+	 * \param priority
+	 *   The desired thread priority (not supported on some
+	 *   operating systems)
+	 */
 	LocalWorker(int coreID, const std::string &name,
 		Thread::EThreadPriority priority = Thread::ENormalPriority);
 
diff --git a/src/libcore/sched.cpp b/src/libcore/sched.cpp
index c95f2218..b6f97336 100644
--- a/src/libcore/sched.cpp
+++ b/src/libcore/sched.cpp
@@ -633,7 +633,8 @@ void Worker::start(Scheduler *scheduler, int workerIndex, int coreOffset) {
 
 LocalWorker::LocalWorker(int coreID, const std::string &name,
 		Thread::EThreadPriority priority) : Worker(name) {
-	setCoreAffinity(coreID);
+	if (coreID >= 0)
+		setCoreAffinity(coreID);
 	m_coreCount = 1;
 #if !defined(__LINUX__)
 	/* Don't set thead priority on Linux, since it uses
diff --git a/src/libcore/thread.cpp b/src/libcore/thread.cpp
index d0351bb8..621ea42b 100644
--- a/src/libcore/thread.cpp
+++ b/src/libcore/thread.cpp
@@ -324,24 +324,51 @@ void Thread::setCoreAffinity(int coreID) {
 	if (getenv("VALGRIND_OPTS") != NULL)
 		return;
 
-	int nCores = getCoreCount();
-	cpu_set_t *cpuset = CPU_ALLOC(nCores);
-	if (cpuset == NULL)
-		Log(EError, "Thread::setCoreAffinity(): could not allocate cpu_set_t");
-
+	int nCores = sysconf(_SC_NPROCESSORS_CONF);
 	size_t size = CPU_ALLOC_SIZE(nCores);
+	cpu_set_t *cpuset = CPU_ALLOC(nCores);
-	CPU_ZERO_S(size, cpuset);
-	if (coreID != -1 && coreID < nCores) {
-		CPU_SET_S(coreID, size, cpuset);
-	} else {
-		for (int i=0; i<nCores; ++i)
-			CPU_SET_S(i, size, cpuset);
+	if (cpuset == NULL) {
+		Log(EWarn, "Thread::setCoreAffinity(): could not allocate cpu_set_t");
+		return;
 	}
+	CPU_ZERO_S(size, cpuset); /* clear the set only after the allocation is known to be valid */
 
 	const pthread_t threadID = d->thread.native_handle();
-	int retval = pthread_setaffinity_np(threadID, size, cpuset);
-	if (retval) 
-		Log(EWarn, "Thread::setCoreAffinity(): pthread_setaffinity_np: failed: %s", strerror(errno));
+	int retval = pthread_getaffinity_np(threadID, size, cpuset);
+	if (retval) {
+		Log(EWarn, "Thread::setCoreAffinity(): pthread_getaffinity_np(): could "
+			"not read thread affinity map: %s", strerror(retval));
+		CPU_FREE(cpuset);
+		return;
+	}
+
+	int actualCoreID = -1, available = 0;
+	for (int i=0; i<nCores; ++i) {
+		if (!CPU_ISSET_S(i, size, cpuset))
+			continue;
+		if (available++ == coreID) {
+			actualCoreID = i; 
+			break;
+		}
+	}
+
+	if (actualCoreID == -1) {
+		Log(EWarn, "Thread::setCoreAffinity(): out of bounds: %i/%i cores available, requested #%i!",
+			available, nCores, coreID);
+		CPU_FREE(cpuset);
+		return;
+	}
+
+	CPU_ZERO_S(size, cpuset);
+	CPU_SET_S(actualCoreID, size, cpuset);
+
+	retval = pthread_setaffinity_np(threadID, size, cpuset);
+	if (retval) {
+		Log(EWarn, "Thread::setCoreAffinity(): pthread_setaffinity_np: failed: %s", strerror(retval));
+		CPU_FREE(cpuset);
+		return;
+	}
+
 	CPU_FREE(cpuset);
 #elif defined(__WINDOWS__)
 	int nCores = getCoreCount();
diff --git a/src/libcore/util.cpp b/src/libcore/util.cpp
index c6e3c405..e297fe6c 100644
--- a/src/libcore/util.cpp
+++ b/src/libcore/util.cpp
@@ -143,19 +143,58 @@ void freeAligned(void *ptr) {
 #endif
 }
 
+/* Cached result of getCoreCount(); 0 means "not determined yet" */
+static int __cached_core_count = 0;
+
 int getCoreCount() {
+	// assumes atomic word size memory access
+	if (__cached_core_count)
+		return __cached_core_count;
+
 #if defined(__WINDOWS__)
 	SYSTEM_INFO sys_info;
 	GetSystemInfo(&sys_info);
+	__cached_core_count = sys_info.dwNumberOfProcessors;
 	return sys_info.dwNumberOfProcessors;
 #elif defined(__OSX__)
 	int nprocs;
 	size_t nprocsSize = sizeof(int);
 	if (sysctlbyname("hw.activecpu", &nprocs, &nprocsSize, NULL, 0))
 		SLog(EError, "Could not detect the number of processors!");
-	return (int) nprocs;
+	__cached_core_count = nprocs;
+	return nprocs;
 #else
-	return sysconf(_SC_NPROCESSORS_CONF);
+	/* Determine the number of present cores */
+	int nCores = sysconf(_SC_NPROCESSORS_CONF);
+
+	/* Some of the cores may not be available to the user
+	   (e.g. on certain cluster nodes) -- determine the number
+	   of actual available cores here. */
+	size_t size = CPU_ALLOC_SIZE(nCores);
+	cpu_set_t *cpuset = CPU_ALLOC(nCores);
+	if (cpuset == NULL) {
+		/* Allocation failed: fall back to the configured core count
+		   rather than dereferencing a NULL set below */
+		SLog(EWarn, "getCoreCount(): could not allocate cpu_set_t");
+		__cached_core_count = nCores;
+		return nCores;
+	}
+	CPU_ZERO_S(size, cpuset);
+	int retval = pthread_getaffinity_np(pthread_self(), size, cpuset);
+	if (retval) {
+		SLog(EWarn, "getCoreCount(): pthread_getaffinity_np(): could "
+			"not read thread affinity map: %s", strerror(retval));
+		__cached_core_count = nCores;
+		CPU_FREE(cpuset);
+		return nCores;
+	}
+
+	int availableCores = 0;
+	for (int i=0; i<nCores; ++i)
+		availableCores += CPU_ISSET_S(i, size, cpuset) ? 1 : 0;
+	CPU_FREE(cpuset);
+	__cached_core_count = availableCores;
+	return availableCores;
 #endif
 }
 
diff --git a/src/mitsuba/mitsuba.cpp b/src/mitsuba/mitsuba.cpp
index b62dde98..d47e06b0 100644
--- a/src/mitsuba/mitsuba.cpp
+++ b/src/mitsuba/mitsuba.cpp
@@ -130,7 +130,8 @@ int mitsuba_app(int argc, char **argv) {
 
 	try {
 		/* Default settings */
-		int nprocs = getCoreCount(), numParallelScenes = 1;
+		int nprocs_avail = getCoreCount(), nprocs = nprocs_avail;
+		int numParallelScenes = 1;
 		std::string nodeName = getHostName(),
 					networkHosts = "", destFile="";
 		bool quietMode = false, progressBars = true, skipExisting = false;
@@ -257,8 +258,10 @@ int mitsuba_app(int argc, char **argv) {
 
 		/* Configure the scheduling subsystem */
 		Scheduler *scheduler = Scheduler::getInstance();
+		bool useCoreAffinity = nprocs == nprocs_avail;
 		for (int i=0; i<nprocs; ++i)
-			scheduler->registerWorker(new LocalWorker(i, formatString("wrk%i", i)));
+			scheduler->registerWorker(new LocalWorker(useCoreAffinity ? i : -1,
+				formatString("wrk%i", i)));
 		std::vector<std::string> hosts = tokenize(networkHosts, ";");
 
 		/* Establish network connections to nested servers */
diff --git a/src/mitsuba/mtsutil.cpp b/src/mitsuba/mtsutil.cpp
index 8d60be3f..2ef15222 100644
--- a/src/mitsuba/mtsutil.cpp
+++ b/src/mitsuba/mtsutil.cpp
@@ -141,7 +141,7 @@ int mtsutil(int argc, char **argv) {
 
 	try {
 		/* Default settings */
-		int nprocs = getCoreCount();
+		int nprocs_avail = getCoreCount(), nprocs = nprocs_avail;
 		std::string nodeName = getHostName(),
 					networkHosts = "", destFile="";
 		bool quietMode = false;
@@ -233,8 +233,11 @@ int mtsutil(int argc, char **argv) {
 
 		/* Configure the scheduling subsystem */
 		Scheduler *scheduler = Scheduler::getInstance();
+		bool useCoreAffinity = nprocs == nprocs_avail;
 		for (int i=0; i<nprocs; ++i)
-			scheduler->registerWorker(new LocalWorker(i, formatString("wrk%i", i)));
+			scheduler->registerWorker(new LocalWorker(useCoreAffinity ? i : -1,
+				formatString("wrk%i", i)));
+
 		std::vector<std::string> hosts = tokenize(networkHosts, ";");
 
 		/* Establish network connections to nested servers */
diff --git a/src/mtsgui/mainwindow.cpp b/src/mtsgui/mainwindow.cpp
index 4ba79eff..2ebb2daa 100644
--- a/src/mtsgui/mainwindow.cpp
+++ b/src/mtsgui/mainwindow.cpp
@@ -53,7 +53,7 @@
 
 extern bool create_symlinks();
 
-static int localWorkerCtr = 0, remoteWorkerCtr = 0;
+static int remoteWorkerCtr = 0;
 
 MainWindow::MainWindow(QWidget *parent) :
 	QMainWindow(parent), ui(new Ui::MainWindow),
@@ -310,8 +310,10 @@ bool MainWindow::initWorkersProcessArgv() {
 
 	m_workerPriority = (Thread::EThreadPriority)
 		settings.value("workerPriority", (int) Thread::ELowPriority).toInt();
+	bool useCoreAffinity = localWorkerCount == getCoreCount();
 	for (int i=0; i<localWorkerCount; ++i)
-		scheduler->registerWorker(new LocalWorker(i, formatString("wrk%i", localWorkerCtr++), m_workerPriority));
+		scheduler->registerWorker(new LocalWorker(useCoreAffinity ? i : -1,
+			formatString("wrk%i", i), m_workerPriority));
 
 	int networkConnections = 0;
 	QList<QVariant> connectionData = settings.value("connections").toList();
@@ -359,7 +361,7 @@ bool MainWindow::initWorkersProcessArgv() {
 		QMessageBox::warning(this, tr("Scheduler warning"),
 			tr("There must be at least one worker thread -- forcing creation of one."),
 			QMessageBox::Ok);
-		scheduler->registerWorker(new LocalWorker(0, formatString("wrk%i", localWorkerCtr++), m_workerPriority));
+		scheduler->registerWorker(new LocalWorker(-1, formatString("wrk%i", 0), m_workerPriority));
 	}
 
 	for (int i=0; i<toBeLoaded.size(); ++i)
@@ -1355,17 +1357,24 @@ void MainWindow::on_actionSettings_triggered() {
 		if (localWorkersChanged || m_connections != d.getConnections()) {
 			ref<Scheduler> sched = Scheduler::getInstance();
 			sched->pause();
-			while (d.getLocalWorkerCount() > (int) localWorkers.size()) {
-				LocalWorker *worker = new LocalWorker(localWorkerCtr, formatString("wrk%i", localWorkerCtr), m_workerPriority);
-				localWorkerCtr++;
-				sched->registerWorker(worker);
-				localWorkers.push_back(worker);
-			}
-			while (d.getLocalWorkerCount() < (int) localWorkers.size()) {
-				Worker *worker = localWorkers.back();
-				sched->unregisterWorker(worker);
-				localWorkers.pop_back();
+
+			if ((int) localWorkers.size() != d.getLocalWorkerCount()) { /* cast avoids a signed/unsigned comparison */
+				/* Completely remove old workers so that CPU affinities can be reassigned */
+				while (!localWorkers.empty()) {
+					Worker *worker = localWorkers.back();
+					sched->unregisterWorker(worker);
+					localWorkers.pop_back();
+				}
+				int workerCount = std::max(1, d.getLocalWorkerCount());
+				bool useCoreAffinity = workerCount == getCoreCount();
+				for (int i=0; i<workerCount; ++i) {
+					LocalWorker *worker = new LocalWorker(useCoreAffinity ? i : -1,
+						formatString("wrk%i", i), m_workerPriority);
+					sched->registerWorker(worker);
+					localWorkers.push_back(worker);
+				}
 			}
+
 			QList<ServerConnection> removeList,
 				&newConnections = d.getConnections();
 			for (int i=0; i<m_connections.size(); ++i) {