Fixed core affinity handling, specified TLS dialect on Linux

- Fixed core affinity handling for Linux clusters with CPU quotas
  (e.g. the ETH Brutus cluster).

- Added an explicit -mtls-dialect flag to the Linux config.py files to
  prevent crashes when loading the Python plugin in certain settings
  (particularly: the ETH cluster).
metadata
Wenzel Jakob 2014-07-01 22:35:30 +02:00
parent a1eecbb55a
commit fbb4867f49
9 changed files with 124 additions and 35 deletions

View File

@ -4,7 +4,7 @@ BUILDDIR = '#build/debug'
DISTDIR = '#dist'
CXX = 'g++'
CC = 'gcc'
CXXFLAGS = ['-O0', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fno-omit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden']
CXXFLAGS = ['-O0', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fno-omit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden', '-mtls-dialect=gnu2']
LINKFLAGS = []
SHLINKFLAGS = ['-rdynamic', '-shared', '-fPIC', '-lstdc++']
BASEINCLUDE = ['#include']

View File

@ -4,7 +4,7 @@ BUILDDIR = '#build/release'
DISTDIR = '#dist'
CXX = 'g++'
CC = 'gcc'
CXXFLAGS = ['-O3', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fomit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden']
CXXFLAGS = ['-O3', '-Wall', '-g', '-pipe', '-march=nocona', '-msse2', '-ftree-vectorize', '-mfpmath=sse', '-funsafe-math-optimizations', '-fno-rounding-math', '-fno-signaling-nans', '-fno-math-errno', '-fomit-frame-pointer', '-DMTS_DEBUG', '-DSINGLE_PRECISION', '-DSPECTRUM_SAMPLES=3', '-DMTS_SSE', '-DMTS_HAS_COHERENT_RT', '-fopenmp', '-fvisibility=hidden', '-mtls-dialect=gnu2']
LINKFLAGS = []
SHLINKFLAGS = ['-rdynamic', '-shared', '-fPIC', '-lstdc++']
BASEINCLUDE = ['#include']

View File

@ -758,6 +758,21 @@ protected:
*/
class MTS_EXPORT_CORE LocalWorker : public Worker {
public:
/**
* \brief Create a new local worker thread
*
* \param coreID
* When a CPU core ID (>=0) is specified here, the worker
* thread will attempt to register core affinity with the
* operating system. Passing -1 disables this.
*
* \param name
* An identifying string for this thread
*
* \param priority
* The desired thread priority (not supported on some
* operating systems)
*/
LocalWorker(int coreID, const std::string &name,
Thread::EThreadPriority priority = Thread::ENormalPriority);

View File

@ -633,7 +633,8 @@ void Worker::start(Scheduler *scheduler, int workerIndex, int coreOffset) {
LocalWorker::LocalWorker(int coreID, const std::string &name,
Thread::EThreadPriority priority) : Worker(name) {
setCoreAffinity(coreID);
if (coreID >= 0)
setCoreAffinity(coreID);
m_coreCount = 1;
#if !defined(__LINUX__)
/* Don't set thread priority on Linux, since it uses

View File

@ -324,24 +324,51 @@ void Thread::setCoreAffinity(int coreID) {
if (getenv("VALGRIND_OPTS") != NULL)
return;
int nCores = getCoreCount();
cpu_set_t *cpuset = CPU_ALLOC(nCores);
if (cpuset == NULL)
Log(EError, "Thread::setCoreAffinity(): could not allocate cpu_set_t");
int nCores = sysconf(_SC_NPROCESSORS_CONF);
size_t size = CPU_ALLOC_SIZE(nCores);
cpu_set_t *cpuset = CPU_ALLOC(nCores);
CPU_ZERO_S(size, cpuset);
if (coreID != -1 && coreID < nCores) {
CPU_SET_S(coreID, size, cpuset);
} else {
for (int i=0; i<nCores; ++i)
CPU_SET_S(i, size, cpuset);
if (cpuset == NULL) {
Log(EWarn, "Thread::setCoreAffinity(): could not allocate cpu_set_t");
return;
}
const pthread_t threadID = d->thread.native_handle();
int retval = pthread_setaffinity_np(threadID, size, cpuset);
if (retval)
Log(EWarn, "Thread::setCoreAffinity(): pthread_setaffinity_np: failed: %s", strerror(errno));
int retval = pthread_getaffinity_np(threadID, size, cpuset);
if (retval) {
Log(EWarn, "Thread::setCoreAffinity(): pthread_getaffinity_np(): could "
"not read thread affinity map: %s", strerror(retval));
CPU_FREE(cpuset);
return;
}
int actualCoreID = -1, available = 0;
for (int i=0; i<nCores; ++i) {
if (!CPU_ISSET_S(i, size, cpuset))
continue;
if (available++ == coreID) {
actualCoreID = i;
break;
}
}
if (actualCoreID == -1) {
Log(EWarn, "Thread::setCoreAffinity(): out of bounds: %i/%i cores available, requested #%i!",
available, nCores, coreID);
CPU_FREE(cpuset);
return;
}
CPU_ZERO_S(size, cpuset);
CPU_SET_S(actualCoreID, size, cpuset);
retval = pthread_setaffinity_np(threadID, size, cpuset);
if (retval) {
Log(EWarn, "Thread::setCoreAffinity(): pthread_setaffinity_np: failed: %s", strerror(retval));
CPU_FREE(cpuset);
return;
}
CPU_FREE(cpuset);
#elif defined(__WINDOWS__)
int nCores = getCoreCount();

View File

@ -143,19 +143,50 @@ void freeAligned(void *ptr) {
#endif
}
static int __cached_core_count = 0;
int getCoreCount() {
// assumes atomic word size memory access
if (__cached_core_count)
return __cached_core_count;
#if defined(__WINDOWS__)
SYSTEM_INFO sys_info;
GetSystemInfo(&sys_info);
__cached_core_count = sys_info.dwNumberOfProcessors;
return sys_info.dwNumberOfProcessors;
#elif defined(__OSX__)
int nprocs;
size_t nprocsSize = sizeof(int);
if (sysctlbyname("hw.activecpu", &nprocs, &nprocsSize, NULL, 0))
SLog(EError, "Could not detect the number of processors!");
return (int) nprocs;
__cached_core_count = nprocs;
return nprocs;
#else
return sysconf(_SC_NPROCESSORS_CONF);
/* Determine the number of present cores */
int nCores = sysconf(_SC_NPROCESSORS_CONF);
/* Some of the cores may not be available to the user
(e.g. on certain cluster nodes) -- determine the number
of actual available cores here. */
size_t size = CPU_ALLOC_SIZE(nCores);
cpu_set_t *cpuset = CPU_ALLOC(nCores);
CPU_ZERO_S(size, cpuset);
int retval = pthread_getaffinity_np(pthread_self(), size, cpuset);
if (retval) {
SLog(EWarn, "getCoreCount(): pthread_getaffinity_np(): could "
"not read thread affinity map: %s", strerror(retval));
__cached_core_count = nCores;
CPU_FREE(cpuset);
return nCores;
}
int availableCores = 0;
for (int i=0; i<nCores; ++i)
availableCores += CPU_ISSET_S(i, size, cpuset) ? 1 : 0;
CPU_FREE(cpuset);
__cached_core_count = availableCores;
return availableCores;
#endif
}

View File

@ -130,7 +130,8 @@ int mitsuba_app(int argc, char **argv) {
try {
/* Default settings */
int nprocs = getCoreCount(), numParallelScenes = 1;
int nprocs_avail = getCoreCount(), nprocs = nprocs_avail;
int numParallelScenes = 1;
std::string nodeName = getHostName(),
networkHosts = "", destFile="";
bool quietMode = false, progressBars = true, skipExisting = false;
@ -257,8 +258,10 @@ int mitsuba_app(int argc, char **argv) {
/* Configure the scheduling subsystem */
Scheduler *scheduler = Scheduler::getInstance();
bool useCoreAffinity = nprocs == nprocs_avail;
for (int i=0; i<nprocs; ++i)
scheduler->registerWorker(new LocalWorker(i, formatString("wrk%i", i)));
scheduler->registerWorker(new LocalWorker(useCoreAffinity ? i : -1,
formatString("wrk%i", i)));
std::vector<std::string> hosts = tokenize(networkHosts, ";");
/* Establish network connections to nested servers */

View File

@ -141,7 +141,7 @@ int mtsutil(int argc, char **argv) {
try {
/* Default settings */
int nprocs = getCoreCount();
int nprocs_avail = getCoreCount(), nprocs = nprocs_avail;
std::string nodeName = getHostName(),
networkHosts = "", destFile="";
bool quietMode = false;
@ -233,8 +233,11 @@ int mtsutil(int argc, char **argv) {
/* Configure the scheduling subsystem */
Scheduler *scheduler = Scheduler::getInstance();
bool useCoreAffinity = nprocs == nprocs_avail;
for (int i=0; i<nprocs; ++i)
scheduler->registerWorker(new LocalWorker(i, formatString("wrk%i", i)));
scheduler->registerWorker(new LocalWorker(useCoreAffinity ? i : -1,
formatString("wrk%i", i)));
std::vector<std::string> hosts = tokenize(networkHosts, ";");
/* Establish network connections to nested servers */

View File

@ -53,7 +53,7 @@
extern bool create_symlinks();
static int localWorkerCtr = 0, remoteWorkerCtr = 0;
static int remoteWorkerCtr = 0;
MainWindow::MainWindow(QWidget *parent) :
QMainWindow(parent), ui(new Ui::MainWindow),
@ -310,8 +310,10 @@ bool MainWindow::initWorkersProcessArgv() {
m_workerPriority = (Thread::EThreadPriority)
settings.value("workerPriority", (int) Thread::ELowPriority).toInt();
bool useCoreAffinity = localWorkerCount == getCoreCount();
for (int i=0; i<localWorkerCount; ++i)
scheduler->registerWorker(new LocalWorker(i, formatString("wrk%i", localWorkerCtr++), m_workerPriority));
scheduler->registerWorker(new LocalWorker(useCoreAffinity ? i : -1,
formatString("wrk%i", i), m_workerPriority));
int networkConnections = 0;
QList<QVariant> connectionData = settings.value("connections").toList();
@ -359,7 +361,7 @@ bool MainWindow::initWorkersProcessArgv() {
QMessageBox::warning(this, tr("Scheduler warning"),
tr("There must be at least one worker thread -- forcing creation of one."),
QMessageBox::Ok);
scheduler->registerWorker(new LocalWorker(0, formatString("wrk%i", localWorkerCtr++), m_workerPriority));
scheduler->registerWorker(new LocalWorker(-1, formatString("wrk%i", 0), m_workerPriority));
}
for (int i=0; i<toBeLoaded.size(); ++i)
@ -1355,17 +1357,24 @@ void MainWindow::on_actionSettings_triggered() {
if (localWorkersChanged || m_connections != d.getConnections()) {
ref<Scheduler> sched = Scheduler::getInstance();
sched->pause();
while (d.getLocalWorkerCount() > (int) localWorkers.size()) {
LocalWorker *worker = new LocalWorker(localWorkerCtr, formatString("wrk%i", localWorkerCtr), m_workerPriority);
localWorkerCtr++;
sched->registerWorker(worker);
localWorkers.push_back(worker);
}
while (d.getLocalWorkerCount() < (int) localWorkers.size()) {
Worker *worker = localWorkers.back();
sched->unregisterWorker(worker);
localWorkers.pop_back();
if (localWorkers.size() != d.getLocalWorkerCount()) {
/* Completely remove old workers so that CPU affinities can be reassigned */
while (!localWorkers.empty()) {
Worker *worker = localWorkers.back();
sched->unregisterWorker(worker);
localWorkers.pop_back();
}
int workerCount = std::max(1, d.getLocalWorkerCount());
bool useCoreAffinity = workerCount == getCoreCount();
for (int i=0; i<workerCount; ++i) {
LocalWorker *worker = new LocalWorker(useCoreAffinity ? i : -1,
formatString("wrk%i", i), m_workerPriority);
sched->registerWorker(worker);
localWorkers.push_back(worker);
}
}
QList<ServerConnection> removeList,
&newConnections = d.getConnections();
for (int i=0; i<m_connections.size(); ++i) {