lwjgl/src/java/org/lwjgl/util/mapped/CacheLineSize.java

package org.lwjgl.util.mapped;

import org.lwjgl.LWJGLUtil;
import org.lwjgl.MemoryUtil;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.IntBuffer;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import static org.lwjgl.util.mapped.MappedHelper.*;

/**
 * This micro-benchmark tries to detect the CPU's cache line size. This is
 * done by exploiting cache line false sharing in multi-threaded code:
 * When 2 threads simultaneously access the same cache line (and at least
 * 1 access is a write), performance drops considerably. We detect this
 * performance drop while decreasing the memory padding in every test step.
 * <p/>
 * The detection can be tuned with two integer properties, both read through
 * {@code LWJGLUtil.getPrivilegedInteger}:
 * <ul>
 * <li>{@code org.lwjgl.util.mapped.CacheLineMaxSize}: the padding, in bytes, that detection starts from (default 1024).</li>
 * <li>{@code org.lwjgl.util.mapped.CacheLineTimeThreshold}: the execution time increase, in percent, that is treated as the false sharing jump (default 50).</li>
 * </ul>
 *
 * @author Spasi
 */
final class CacheLineSize {

	private CacheLineSize() {
	}

	/**
	 * Runs the false sharing micro-benchmark and returns the detected cache line size.
	 * <p/>
	 * Two worker threads repeatedly increment one integer each. The distance between the
	 * two integers starts at the configured maximum and is halved on every step. As soon as
	 * a step takes noticeably longer than the average of the previous steps, the padding of
	 * the previous step is reported as the cache line size.
	 *
	 * @return the detected cache line size in bytes, or 64 if no jump in execution time was observed
	 *
	 * @throws RuntimeException if a worker task fails or the calling thread is interrupted while waiting for results
	 */
	static int getCacheLineSize() {
		final int THREADS = 2;
		final int REPEATS = 100000 * THREADS;
		final int LOCAL_REPEATS = REPEATS / THREADS;

		// Detection will start from CacheLineMaxSize bytes.
		final int MAX_SIZE = LWJGLUtil.getPrivilegedInteger("org.lwjgl.util.mapped.CacheLineMaxSize", 1024) / 4; // in # of integers
		// Detection will stop when the execution time increases by more than CacheLineTimeThreshold %.
		final double TIME_THRESHOLD = 1.0 + LWJGLUtil.getPrivilegedInteger("org.lwjgl.util.mapped.CacheLineTimeThreshold", 50) / 100.0;

		final ExecutorService executorService = Executors.newFixedThreadPool(THREADS);
		final ExecutorCompletionService<Long> completionService = new ExecutorCompletionService<Long>(executorService);

		try {
			// We need to use a NIO buffer in order to guarantee memory alignment.
			final IntBuffer memory = getMemory(MAX_SIZE);

			// -- WARMUP --

			// Padding 0 makes every thread hit the same integer; the measured times are discarded.
			final int WARMUP = 10;
			for ( int i = 0; i < WARMUP; i++ )
				doTest(THREADS, LOCAL_REPEATS, 0, memory, completionService);

			// -- CACHE LINE SIZE DETECTION --

			long totalTime = 0;
			int count = 0;
			int cacheLineSize = 64; // fallback to the most common size these days
			boolean found = false;
			// i is the padding between the threads' integers, in # of integers. It is halved on every step.
			for ( int i = MAX_SIZE; i >= 1; i >>= 1 ) {
				final long time = doTest(THREADS, LOCAL_REPEATS, i, memory, completionService);
				if ( totalTime > 0 ) { // Ignore first run
					final long avgTime = totalTime / count;
					if ( (double)time / (double)avgTime > TIME_THRESHOLD ) { // Try to detect a noticeable jump in execution time
						// The previous (twice as large) padding was the last one without the slowdown. Convert it to bytes.
						cacheLineSize = (i << 1) * 4;
						found = true;
						break;
					}
				}
				totalTime += time;
				count++;
			}

			if ( LWJGLUtil.DEBUG ) {
				if ( found )
					LWJGLUtil.log("Cache line size detected: " + cacheLineSize + " bytes");
				else
					LWJGLUtil.log("Failed to detect cache line size, assuming " + cacheLineSize + " bytes");
			}

			return cacheLineSize;
		} finally {
			// Always release the worker threads, even when a test run failed.
			executorService.shutdown();
		}
	}

	/**
	 * Command line entry point. Delegates to {@code CacheUtil.getCacheLineSize()} and discards the result.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		// NOTE(review): CacheUtil is defined elsewhere; presumably it ends up running the detection above -- confirm.
		CacheUtil.getCacheLineSize();
	}

	/**
	 * Repeatedly increments a single integer inside {@code memory} and measures how long that takes.
	 *
	 * @param index   the index of the calling worker, selects which integer is incremented
	 * @param repeats the number of increments to perform
	 * @param memory  the buffer that contains the integers of all workers
	 * @param padding the distance between the integers of 2 consecutive workers, in # of integers
	 *
	 * @return the elapsed time of the increment loop, in nanoseconds
	 */
	static long memoryLoop(final int index, final int repeats, final IntBuffer memory, final int padding) {
		// Compute the byte offset with long arithmetic, so that it cannot overflow before it is added to the address.
		final long address = MemoryUtil.getAddress(memory) + ((long)index * padding * 4L);

		final long time = System.nanoTime();
		for ( int i = 0; i < repeats; i++ ) {
			// Use volatile access to avoid server VM optimizations.
			ivput(ivget(address) + 1, address);
		}

		return System.nanoTime() - time;
	}

	/**
	 * Allocates the native-ordered direct buffer used by the benchmark and aligns its start to a page boundary.
	 *
	 * @param START_SIZE the maximum padding that will be tested, in # of integers
	 *
	 * @return an int view of the buffer, starting at a page-aligned address
	 */
	private static IntBuffer getMemory(final int START_SIZE) {
		final int PAGE_SIZE = MappedObjectUnsafe.INSTANCE.pageSize();

		// Allocate an extra page, so that the requested size is still available after the start has been moved forward.
		final ByteBuffer buffer = ByteBuffer.allocateDirect((START_SIZE * 4) + PAGE_SIZE).order(ByteOrder.nativeOrder());

		// Align to page and, consequently, to cache line. Otherwise results will be inconsistent.
		if ( MemoryUtil.getAddress(buffer) % PAGE_SIZE != 0 ) {
			// Round up to page boundary
			buffer.position(PAGE_SIZE - (int)(MemoryUtil.getAddress(buffer) & (PAGE_SIZE - 1)));
		}

		// The int view starts at the buffer's current position.
		return buffer.asIntBuffer();
	}

	/**
	 * Runs one benchmark step: submits one {@link #memoryLoop} task per thread and waits for all of them.
	 *
	 * @param threads           the number of tasks to submit
	 * @param repeats           the number of increments each task performs
	 * @param padding           the distance between the integers of 2 consecutive tasks, in # of integers
	 * @param memory            the buffer the tasks operate on
	 * @param completionService the service used to run the tasks
	 *
	 * @return the sum of the times reported by all tasks, in nanoseconds
	 */
	private static long doTest(final int threads, final int repeats, final int padding, final IntBuffer memory, final ExecutorCompletionService<Long> completionService) {
		for ( int i = 0; i < threads; i++ )
			submitTest(completionService, i, repeats, memory, padding);
		return waitForResults(threads, completionService);
	}

	/**
	 * Submits a single {@link #memoryLoop} task to the specified completion service.
	 *
	 * @param completionService the service used to run the task
	 * @param index             the worker index passed to {@link #memoryLoop}
	 * @param repeats           the number of increments the task performs
	 * @param memory            the buffer the task operates on
	 * @param padding           the distance between the integers of 2 consecutive tasks, in # of integers
	 */
	private static void submitTest(final ExecutorCompletionService<Long> completionService, final int index, final int repeats, final IntBuffer memory, final int padding) {
		completionService.submit(new Callable<Long>() {
			public Long call() throws Exception {
				return memoryLoop(index, repeats, memory, padding);
			}
		});
	}

	/**
	 * Blocks until {@code count} tasks have completed and sums up their results.
	 *
	 * @param count             the number of results to wait for
	 * @param completionService the service the tasks were submitted to
	 *
	 * @return the sum of the results of the completed tasks
	 *
	 * @throws RuntimeException if a task failed or the calling thread was interrupted while waiting.
	 *                          In the latter case the interrupt status of the thread is restored before throwing.
	 */
	private static long waitForResults(final int count, final ExecutorCompletionService<Long> completionService) {
		try {
			long totalTime = 0;
			for ( int i = 0; i < count; i++ )
				totalTime += completionService.take().get();
			return totalTime;
		} catch (InterruptedException e) {
			// Catching InterruptedException clears the interrupt status. Restore it, so that code
			// further up the stack can still observe the interruption.
			Thread.currentThread().interrupt();
			throw new RuntimeException(e);
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

}