Fixed core affinity handling, specified TLS dialect on Linux

- Fixed core affinity handling for Linux clusters with CPU quotas
  (e.g. the ETH Brutus cluster).

- Added an explicit -mtls-dialect flag to the Linux config.py files to
  prevent crashes when loading the Python plugin in certain settings
  (particularly: the ETH cluster).
metadata
Wenzel Jakob 2014-07-01 22:35:30 +02:00
parent a1eecbb55a
commit fbb4867f49
9 changed files with 124 additions and 35 deletions

View File

@ -4,7 +4,7 @@ BUILDDIR = '#build/debug'
DISTDIR = '#dist' DISTDIR = '#dist'
CXX = 'g++' CXX = 'g++'
CC = 'gcc' CC = 'gcc'
CXXFLAGS = ['-O0', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fno-omit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden'] CXXFLAGS = ['-O0', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fno-omit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden', '-mtls-dialect=gnu2']
LINKFLAGS = [] LINKFLAGS = []
SHLINKFLAGS = ['-rdynamic', '-shared', '-fPIC', '-lstdc++'] SHLINKFLAGS = ['-rdynamic', '-shared', '-fPIC', '-lstdc++']
BASEINCLUDE = ['#include'] BASEINCLUDE = ['#include']

View File

@ -4,7 +4,7 @@ BUILDDIR = '#build/release'
DISTDIR = '#dist' DISTDIR = '#dist'
CXX = 'g++' CXX = 'g++'
CC = 'gcc' CC = 'gcc'
CXXFLAGS = ['-O3', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fomit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden'] CXXFLAGS = ['-O3', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fomit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden', '-mtls-dialect=gnu2']
LINKFLAGS = [] LINKFLAGS = []
SHLINKFLAGS = ['-rdynamic', '-shared', '-fPIC', '-lstdc++'] SHLINKFLAGS = ['-rdynamic', '-shared', '-fPIC', '-lstdc++']
BASEINCLUDE = ['#include'] BASEINCLUDE = ['#include']

View File

@ -758,6 +758,21 @@ protected:
*/ */
class MTS_EXPORT_CORE LocalWorker : public Worker { class MTS_EXPORT_CORE LocalWorker : public Worker {
public: public:
/**
* \brief Create a new local worker thread
*
* \param coreID
* When a CPU core ID (>=0) is specified here, the worker
* thread will attempt to register core affinity with the
* operating system. Passing -1 disables this.
*
* \param name
* An identifying string for this thread
*
* \param priority
* The desired thread priority (not supported on some
* operating systems)
*/
LocalWorker(int coreID, const std::string &name, LocalWorker(int coreID, const std::string &name,
Thread::EThreadPriority priority = Thread::ENormalPriority); Thread::EThreadPriority priority = Thread::ENormalPriority);

View File

@ -633,6 +633,7 @@ void Worker::start(Scheduler *scheduler, int workerIndex, int coreOffset) {
LocalWorker::LocalWorker(int coreID, const std::string &name, LocalWorker::LocalWorker(int coreID, const std::string &name,
Thread::EThreadPriority priority) : Worker(name) { Thread::EThreadPriority priority) : Worker(name) {
if (coreID >= 0)
setCoreAffinity(coreID); setCoreAffinity(coreID);
m_coreCount = 1; m_coreCount = 1;
#if !defined(__LINUX__) #if !defined(__LINUX__)

View File

@ -324,24 +324,51 @@ void Thread::setCoreAffinity(int coreID) {
if (getenv("VALGRIND_OPTS") != NULL) if (getenv("VALGRIND_OPTS") != NULL)
return; return;
int nCores = getCoreCount(); int nCores = sysconf(_SC_NPROCESSORS_CONF);
cpu_set_t *cpuset = CPU_ALLOC(nCores);
if (cpuset == NULL)
Log(EError, "Thread::setCoreAffinity(): could not allocate cpu_set_t");
size_t size = CPU_ALLOC_SIZE(nCores); size_t size = CPU_ALLOC_SIZE(nCores);
cpu_set_t *cpuset = CPU_ALLOC(nCores);
CPU_ZERO_S(size, cpuset); CPU_ZERO_S(size, cpuset);
if (coreID != -1 && coreID < nCores) { if (cpuset == NULL) {
CPU_SET_S(coreID, size, cpuset); Log(EWarn, "Thread::setCoreAffinity(): could not allocate cpu_set_t");
} else { return;
for (int i=0; i<nCores; ++i)
CPU_SET_S(i, size, cpuset);
} }
const pthread_t threadID = d->thread.native_handle(); const pthread_t threadID = d->thread.native_handle();
int retval = pthread_setaffinity_np(threadID, size, cpuset); int retval = pthread_getaffinity_np(threadID, size, cpuset);
if (retval) if (retval) {
Log(EWarn, "Thread::setCoreAffinity(): pthread_setaffinity_np: failed: %s", strerror(errno)); Log(EWarn, "Thread::setCoreAffinity(): pthread_getaffinity_np(): could "
"not read thread affinity map: %s", strerror(retval));
CPU_FREE(cpuset);
return;
}
int actualCoreID = -1, available = 0;
for (int i=0; i<nCores; ++i) {
if (!CPU_ISSET_S(i, size, cpuset))
continue;
if (available++ == coreID) {
actualCoreID = i;
break;
}
}
if (actualCoreID == -1) {
Log(EWarn, "Thread::setCoreAffinity(): out of bounds: %i/%i cores available, requested #%i!",
available, nCores, coreID);
CPU_FREE(cpuset);
return;
}
CPU_ZERO_S(size, cpuset);
CPU_SET_S(actualCoreID, size, cpuset);
retval = pthread_setaffinity_np(threadID, size, cpuset);
if (retval) {
Log(EWarn, "Thread::setCoreAffinity(): pthread_setaffinity_np: failed: %s", strerror(retval));
CPU_FREE(cpuset);
return;
}
CPU_FREE(cpuset); CPU_FREE(cpuset);
#elif defined(__WINDOWS__) #elif defined(__WINDOWS__)
int nCores = getCoreCount(); int nCores = getCoreCount();

View File

@ -143,19 +143,50 @@ void freeAligned(void *ptr) {
#endif #endif
} }
static int __cached_core_count = 0;
int getCoreCount() { int getCoreCount() {
// assumes atomic word size memory access
if (__cached_core_count)
return __cached_core_count;
#if defined(__WINDOWS__) #if defined(__WINDOWS__)
SYSTEM_INFO sys_info; SYSTEM_INFO sys_info;
GetSystemInfo(&sys_info); GetSystemInfo(&sys_info);
__cached_core_count = sys_info.dwNumberOfProcessors;
return sys_info.dwNumberOfProcessors; return sys_info.dwNumberOfProcessors;
#elif defined(__OSX__) #elif defined(__OSX__)
int nprocs; int nprocs;
size_t nprocsSize = sizeof(int); size_t nprocsSize = sizeof(int);
if (sysctlbyname("hw.activecpu", &nprocs, &nprocsSize, NULL, 0)) if (sysctlbyname("hw.activecpu", &nprocs, &nprocsSize, NULL, 0))
SLog(EError, "Could not detect the number of processors!"); SLog(EError, "Could not detect the number of processors!");
return (int) nprocs; __cached_core_count = nprocs;
return nprocs;
#else #else
return sysconf(_SC_NPROCESSORS_CONF); /* Determine the number of present cores */
int nCores = sysconf(_SC_NPROCESSORS_CONF);
/* Some of the cores may not be available to the user
(e.g. on certain cluster nodes) -- determine the number
of actually available cores here. */
size_t size = CPU_ALLOC_SIZE(nCores);
cpu_set_t *cpuset = CPU_ALLOC(nCores);
CPU_ZERO_S(size, cpuset);
int retval = pthread_getaffinity_np(pthread_self(), size, cpuset);
if (retval) {
SLog(EWarn, "getCoreCount(): pthread_getaffinity_np(): could "
"not read thread affinity map: %s", strerror(retval));
__cached_core_count = nCores;
CPU_FREE(cpuset);
return nCores;
}
int availableCores = 0;
for (int i=0; i<nCores; ++i)
availableCores += CPU_ISSET_S(i, size, cpuset) ? 1 : 0;
CPU_FREE(cpuset);
__cached_core_count = availableCores;
return availableCores;
#endif #endif
} }

View File

@ -130,7 +130,8 @@ int mitsuba_app(int argc, char **argv) {
try { try {
/* Default settings */ /* Default settings */
int nprocs = getCoreCount(), numParallelScenes = 1; int nprocs_avail = getCoreCount(), nprocs = nprocs_avail;
int numParallelScenes = 1;
std::string nodeName = getHostName(), std::string nodeName = getHostName(),
networkHosts = "", destFile=""; networkHosts = "", destFile="";
bool quietMode = false, progressBars = true, skipExisting = false; bool quietMode = false, progressBars = true, skipExisting = false;
@ -257,8 +258,10 @@ int mitsuba_app(int argc, char **argv) {
/* Configure the scheduling subsystem */ /* Configure the scheduling subsystem */
Scheduler *scheduler = Scheduler::getInstance(); Scheduler *scheduler = Scheduler::getInstance();
bool useCoreAffinity = nprocs == nprocs_avail;
for (int i=0; i<nprocs; ++i) for (int i=0; i<nprocs; ++i)
scheduler->registerWorker(new LocalWorker(i, formatString("wrk%i", i))); scheduler->registerWorker(new LocalWorker(useCoreAffinity ? i : -1,
formatString("wrk%i", i)));
std::vector<std::string> hosts = tokenize(networkHosts, ";"); std::vector<std::string> hosts = tokenize(networkHosts, ";");
/* Establish network connections to nested servers */ /* Establish network connections to nested servers */

View File

@ -141,7 +141,7 @@ int mtsutil(int argc, char **argv) {
try { try {
/* Default settings */ /* Default settings */
int nprocs = getCoreCount(); int nprocs_avail = getCoreCount(), nprocs = nprocs_avail;
std::string nodeName = getHostName(), std::string nodeName = getHostName(),
networkHosts = "", destFile=""; networkHosts = "", destFile="";
bool quietMode = false; bool quietMode = false;
@ -233,8 +233,11 @@ int mtsutil(int argc, char **argv) {
/* Configure the scheduling subsystem */ /* Configure the scheduling subsystem */
Scheduler *scheduler = Scheduler::getInstance(); Scheduler *scheduler = Scheduler::getInstance();
bool useCoreAffinity = nprocs == nprocs_avail;
for (int i=0; i<nprocs; ++i) for (int i=0; i<nprocs; ++i)
scheduler->registerWorker(new LocalWorker(i, formatString("wrk%i", i))); scheduler->registerWorker(new LocalWorker(useCoreAffinity ? i : -1,
formatString("wrk%i", i)));
std::vector<std::string> hosts = tokenize(networkHosts, ";"); std::vector<std::string> hosts = tokenize(networkHosts, ";");
/* Establish network connections to nested servers */ /* Establish network connections to nested servers */

View File

@ -53,7 +53,7 @@
extern bool create_symlinks(); extern bool create_symlinks();
static int localWorkerCtr = 0, remoteWorkerCtr = 0; static int remoteWorkerCtr = 0;
MainWindow::MainWindow(QWidget *parent) : MainWindow::MainWindow(QWidget *parent) :
QMainWindow(parent), ui(new Ui::MainWindow), QMainWindow(parent), ui(new Ui::MainWindow),
@ -310,8 +310,10 @@ bool MainWindow::initWorkersProcessArgv() {
m_workerPriority = (Thread::EThreadPriority) m_workerPriority = (Thread::EThreadPriority)
settings.value("workerPriority", (int) Thread::ELowPriority).toInt(); settings.value("workerPriority", (int) Thread::ELowPriority).toInt();
bool useCoreAffinity = localWorkerCount == getCoreCount();
for (int i=0; i<localWorkerCount; ++i) for (int i=0; i<localWorkerCount; ++i)
scheduler->registerWorker(new LocalWorker(i, formatString("wrk%i", localWorkerCtr++), m_workerPriority)); scheduler->registerWorker(new LocalWorker(useCoreAffinity ? i : -1,
formatString("wrk%i", i), m_workerPriority));
int networkConnections = 0; int networkConnections = 0;
QList<QVariant> connectionData = settings.value("connections").toList(); QList<QVariant> connectionData = settings.value("connections").toList();
@ -359,7 +361,7 @@ bool MainWindow::initWorkersProcessArgv() {
QMessageBox::warning(this, tr("Scheduler warning"), QMessageBox::warning(this, tr("Scheduler warning"),
tr("There must be at least one worker thread -- forcing creation of one."), tr("There must be at least one worker thread -- forcing creation of one."),
QMessageBox::Ok); QMessageBox::Ok);
scheduler->registerWorker(new LocalWorker(0, formatString("wrk%i", localWorkerCtr++), m_workerPriority)); scheduler->registerWorker(new LocalWorker(-1, formatString("wrk%i", 0), m_workerPriority));
} }
for (int i=0; i<toBeLoaded.size(); ++i) for (int i=0; i<toBeLoaded.size(); ++i)
@ -1355,17 +1357,24 @@ void MainWindow::on_actionSettings_triggered() {
if (localWorkersChanged || m_connections != d.getConnections()) { if (localWorkersChanged || m_connections != d.getConnections()) {
ref<Scheduler> sched = Scheduler::getInstance(); ref<Scheduler> sched = Scheduler::getInstance();
sched->pause(); sched->pause();
while (d.getLocalWorkerCount() > (int) localWorkers.size()) {
LocalWorker *worker = new LocalWorker(localWorkerCtr, formatString("wrk%i", localWorkerCtr), m_workerPriority); if (localWorkers.size() != d.getLocalWorkerCount()) {
localWorkerCtr++; /* Completely remove old workers so that CPU affinities can be reassigned */
sched->registerWorker(worker); while (!localWorkers.empty()) {
localWorkers.push_back(worker);
}
while (d.getLocalWorkerCount() < (int) localWorkers.size()) {
Worker *worker = localWorkers.back(); Worker *worker = localWorkers.back();
sched->unregisterWorker(worker); sched->unregisterWorker(worker);
localWorkers.pop_back(); localWorkers.pop_back();
} }
int workerCount = std::max(1, d.getLocalWorkerCount());
bool useCoreAffinity = workerCount == getCoreCount();
for (int i=0; i<workerCount; ++i) {
LocalWorker *worker = new LocalWorker(useCoreAffinity ? i : -1,
formatString("wrk%i", i), m_workerPriority);
sched->registerWorker(worker);
localWorkers.push_back(worker);
}
}
QList<ServerConnection> removeList, QList<ServerConnection> removeList,
&newConnections = d.getConnections(); &newConnections = d.getConnections();
for (int i=0; i<m_connections.size(); ++i) { for (int i=0; i<m_connections.size(); ++i) {