lwjgl/src/java/org/lwjgl/util/mapped/CacheLineSize.java

141 lines
4.7 KiB
Java

package org.lwjgl.util.mapped;
import org.lwjgl.LWJGLUtil;
import org.lwjgl.MemoryUtil;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.IntBuffer;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import static org.lwjgl.util.mapped.MappedHelper.*;
/**
 * This micro-benchmark tries to detect the CPU's cache line size. This is
 * done by exploiting cache line false sharing in multi-threaded code:
 * When 2 threads simultaneously access the same cache line (and at least
 * 1 access is a write), performance drops considerably. We detect this
 * performance drop while decreasing the memory padding in every test step.
 *
 * @author Spasi
 */
final class CacheLineSize {

    /** Static utility class, not instantiable. */
    private CacheLineSize() {
    }

    /**
     * Runs the detection benchmark and returns the detected cache line size.
     *
     * <p>Two worker threads repeatedly increment one int each. The ints are
     * {@code padding} ints apart, starting at a page-aligned address. The padding
     * starts at {@code CacheLineMaxSize / 4} ints and is halved after every step,
     * down to 1. As soon as a step takes more than {@code CacheLineTimeThreshold}
     * percent longer than the average of the previous steps, the previous (twice
     * as large) padding is reported as the cache line size.</p>
     *
     * <p>Tunables, read through {@code LWJGLUtil.getPrivilegedInteger}:</p>
     * <ul>
     * <li>{@code org.lwjgl.util.mapped.CacheLineMaxSize} - initial padding in bytes, defaults to 1024</li>
     * <li>{@code org.lwjgl.util.mapped.CacheLineTimeThreshold} - slowdown in percent that counts as a hit, defaults to 50</li>
     * </ul>
     *
     * @return the detected cache line size in bytes, or 64 if no slowdown was observed
     */
    static int getCacheLineSize() {
        final int THREADS = 2;
        final int REPEATS = 100000 * THREADS;
        final int LOCAL_REPEATS = REPEATS / THREADS;

        // Detection will start from CacheLineMaxSize bytes.
        final int MAX_SIZE = LWJGLUtil.getPrivilegedInteger("org.lwjgl.util.mapped.CacheLineMaxSize", 1024) / 4; // in # of integers
        // Detection will stop when the execution time increases by more than CacheLineTimeThreshold %.
        final double TIME_THRESHOLD = 1.0 + LWJGLUtil.getPrivilegedInteger("org.lwjgl.util.mapped.CacheLineTimeThreshold", 50) / 100.0;

        final ExecutorService executorService = Executors.newFixedThreadPool(THREADS);
        final ExecutorCompletionService<Long> completionService = new ExecutorCompletionService<Long>(executorService);

        try {
            // We need to use a NIO buffer in order to guarantee memory alignment.
            final IntBuffer memory = getMemory(MAX_SIZE);

            // -- WARMUP --
            // Padding 0 makes every thread hit the same int; the timings are discarded.
            final int WARMUP = 10;
            for ( int i = 0; i < WARMUP; i++ )
                doTest(THREADS, LOCAL_REPEATS, 0, memory, completionService);

            // -- CACHE LINE SIZE DETECTION --
            long totalTime = 0;
            int count = 0;
            int cacheLineSize = 64; // fallback to the most common size these days
            boolean found = false;

            for ( int i = MAX_SIZE; i >= 1; i >>= 1 ) {
                final long time = doTest(THREADS, LOCAL_REPEATS, i, memory, completionService);

                if ( totalTime > 0 ) { // Ignore first run (there is no baseline to compare against yet)
                    final long avgTime = totalTime / count;
                    if ( (double)time / (double)avgTime > TIME_THRESHOLD ) { // Try to detect a noticeable jump in execution time
                        // The current padding already shares a line, so the line is
                        // as wide as the previous padding: (i * 2) ints * 4 bytes.
                        cacheLineSize = (i << 1) * 4;
                        found = true;
                        break;
                    }
                }

                totalTime += time;
                count++;
            }

            if ( LWJGLUtil.DEBUG ) {
                if ( found )
                    LWJGLUtil.log("Cache line size detected: " + cacheLineSize + " bytes");
                else
                    LWJGLUtil.log("Failed to detect cache line size, assuming " + cacheLineSize + " bytes");
            }

            return cacheLineSize;
        } finally {
            // Always release the worker threads, even when a test step failed.
            executorService.shutdown();
        }
    }

    /**
     * Command line entry point; delegates to {@code CacheUtil.getCacheLineSize()}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        CacheUtil.getCacheLineSize();
    }

    /**
     * Benchmark kernel executed by each worker thread: increments a single int
     * {@code repeats} times and measures how long that takes.
     *
     * @param index   the worker index, selects which int this worker touches
     * @param repeats the number of read-increment-write iterations
     * @param memory  the buffer whose address is used as the base of the test memory
     * @param padding the distance between the ints of two consecutive workers, in ints
     *
     * @return the elapsed time in nanoseconds
     */
    static long memoryLoop(final int index, final int repeats, final IntBuffer memory, final int padding) {
        // Compute the byte offset in 64 bits so that it cannot wrap around before it is added to the address.
        final long address = MemoryUtil.getAddress(memory) + ((long)index * padding * 4L);

        final long time = System.nanoTime();
        for ( int i = 0; i < repeats; i++ ) {
            // Use volatile access to avoid server VM optimizations.
            ivput(ivget(address) + 1, address);
        }

        return System.nanoTime() - time;
    }

    /**
     * Allocates the direct, native-ordered test memory and positions it on a page boundary.
     *
     * <p>One extra page is allocated so that the start can be moved forward to the next
     * page boundary without losing capacity for the test area.</p>
     *
     * @param START_SIZE the size of the test area, in ints
     *
     * @return an int view of the buffer that starts at the aligned position
     */
    private static IntBuffer getMemory(final int START_SIZE) {
        final int PAGE_SIZE = MappedObjectUnsafe.INSTANCE.pageSize();

        final ByteBuffer buffer = ByteBuffer.allocateDirect((START_SIZE * 4) + PAGE_SIZE).order(ByteOrder.nativeOrder());

        // Align to page and, consequently, to cache line. Otherwise results will be inconsistent.
        if ( MemoryUtil.getAddress(buffer) % PAGE_SIZE != 0 ) {
            // Round up to page boundary
            // NOTE(review): the bit mask assumes PAGE_SIZE is a power of two.
            buffer.position(PAGE_SIZE - (int)(MemoryUtil.getAddress(buffer) & (PAGE_SIZE - 1)));
        }

        return buffer.asIntBuffer();
    }

    /**
     * Runs one test step: submits one {@link #memoryLoop} task per worker and waits for all of them.
     *
     * @param threads           the number of tasks to submit
     * @param repeats           the number of iterations each task performs
     * @param padding           the distance between the ints of two consecutive workers, in ints
     * @param memory            the test memory
     * @param completionService the service that executes the tasks
     *
     * @return the sum of the times reported by all tasks, in nanoseconds
     */
    private static long doTest(final int threads, final int repeats, final int padding, final IntBuffer memory, final ExecutorCompletionService<Long> completionService) {
        for ( int i = 0; i < threads; i++ )
            submitTest(completionService, i, repeats, memory, padding);
        return waitForResults(threads, completionService);
    }

    /**
     * Submits a single {@link #memoryLoop} task for the worker with the given index.
     *
     * @param completionService the service that executes the task
     * @param index             the worker index
     * @param repeats           the number of iterations the task performs
     * @param memory            the test memory
     * @param padding           the distance between the ints of two consecutive workers, in ints
     */
    private static void submitTest(final ExecutorCompletionService<Long> completionService, final int index, final int repeats, final IntBuffer memory, final int padding) {
        completionService.submit(new Callable<Long>() {
            public Long call() throws Exception {
                return memoryLoop(index, repeats, memory, padding);
            }
        });
    }

    /**
     * Blocks until {@code count} tasks have completed and adds up their results.
     *
     * @param count             the number of task results to wait for
     * @param completionService the service the tasks were submitted to
     *
     * @return the sum of all task results, in nanoseconds
     *
     * @throws RuntimeException if the calling thread is interrupted while waiting (the
     *                          interrupt status is restored first) or if a task failed
     */
    private static long waitForResults(final int count, final ExecutorCompletionService<Long> completionService) {
        try {
            long totalTime = 0;
            for ( int i = 0; i < count; i++ )
                totalTime += completionService.take().get();
            return totalTime;
        } catch (InterruptedException e) {
            // Do not swallow the interruption: restore the flag so that callers
            // further up the stack can still observe it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

}