[Likwid-commit] [likwid] 01/09: Imported Upstream version 3.1.3

Christoph Martin chrism at debian.org
Mon Mar 2 15:31:05 UTC 2015


This is an automated email from the git hooks/post-receive script.

chrism pushed a commit to branch master
in repository likwid.

commit 7c57191e41356c3f95f7a50a64bf20ab063c652c
Author: Christoph Martin <martin at uni-mainz.de>
Date:   Wed Feb 25 17:26:21 2015 +0100

    Imported Upstream version 3.1.3
---
 INSTALL                                            |   10 +-
 Makefile                                           |   82 +-
 README                                             |    2 +-
 bench/x86-64/branch.ptt                            |   36 +
 bench/x86-64/copy_avx.ptt                          |   15 +
 bench/x86-64/copy_mem_avx.ptt                      |   14 +
 bench/x86-64/copy_mem_sse.ptt                      |   15 +
 bench/x86-64/copy_plain.ptt                        |   16 +
 bench/x86-64/copy_sse.ptt                          |   15 +
 bench/x86-64/load_avx.ptt                          |   12 +
 bench/x86-64/load_plain.ptt                        |   12 +
 bench/x86-64/load_sse.ptt                          |   12 +
 bench/x86-64/peak_avx.ptt                          |   49 +
 bench/x86-64/peak_sse.ptt                          |   49 +
 bench/x86-64/peakflops_avx.ptt                     |   37 +
 bench/x86-64/peakflops_sse.ptt                     |   37 +
 bench/x86-64/store_avx.ptt                         |   15 +
 bench/x86-64/store_mem_avx.ptt                     |   14 +
 bench/x86-64/store_mem_sse.ptt                     |   14 +
 bench/x86-64/store_plain.ptt                       |   15 +
 bench/x86-64/store_sse.ptt                         |   15 +
 bench/x86-64/stream_avx.ptt                        |   22 +
 bench/x86-64/striad_avx.ptt                        |   23 +
 bench/x86-64/striad_mem_avx.ptt                    |   11 +
 bench/x86-64/striad_mem_sse.ptt                    |   11 +
 bench/x86-64/striad_plain.ptt                      |   23 +
 bench/x86-64/striad_sse.ptt                        |   23 +
 bench/x86-64/sum_sse.ptt                           |   23 +
 bench/x86-64/triad_avx.ptt                         |   12 +
 bench/x86-64/triad_split.ptt                       |   30 +
 bench/x86-64/update_avx.ptt                        |   15 +
 bench/x86-64/update_plain.ptt                      |   15 +
 bench/x86-64/update_sse.ptt                        |   15 +
 bench/x86-64/vtriad_avx.ptt                        |   22 +
 bench/x86-64/vtriad_mem_avx.ptt                    |   10 +
 bench/x86-64/vtriad_mem_sse.ptt                    |   10 +
 bench/x86-64/vtriad_plain.ptt                      |   22 +
 bench/x86-64/vtriad_sse.ptt                        |   22 +
 config.mk                                          |    1 -
 doc/feedGnuplot.1                                  |  190 +++
 doc/likwid-accessD.1                               |   22 +
 doc/likwid-bench.1                                 |   31 +-
 doc/likwid-features.1                              |   14 +-
 doc/likwid-genCfg.1                                |   30 +
 doc/likwid-memsweeper.1                            |   28 +
 doc/likwid-mpirun.1                                |   81 ++
 doc/likwid-perfctr.1                               |  100 +-
 doc/likwid-perfscope.1                             |   55 +
 doc/likwid-pin.1                                   |   24 +-
 doc/likwid-powermeter.1                            |   17 +-
 doc/likwid-setFreq.1                               |   24 +
 doc/likwid-setFrequencies.1                        |   16 +-
 doc/likwid-topology.1                              |   14 +-
 filters/csv                                        |    5 +-
 filters/xml                                        |   27 +-
 groups/core2/BRANCH.txt                            |    2 +
 groups/core2/CACHE.txt                             |    2 +
 groups/core2/DATA.txt                              |    2 +
 groups/core2/FLOPS_DP.txt                          |    2 +
 groups/core2/FLOPS_SP.txt                          |    2 +
 groups/core2/FLOPS_X87.txt                         |    2 +
 groups/core2/L2.txt                                |    2 +
 groups/core2/L2CACHE.txt                           |    2 +
 groups/core2/MEM.txt                               |    2 +
 groups/core2/TLB.txt                               |    2 +
 groups/haswell/ENERGY.txt                          |   11 +-
 groups/haswell/ICACHE.txt                          |   25 +
 groups/haswell/L2.txt                              |   33 +
 groups/haswell/L2CACHE.txt                         |    6 +-
 groups/haswell/{L2CACHE.txt => L3CACHE.txt}        |   24 +-
 groups/haswell/TLB.txt                             |   22 -
 groups/haswell/TLB_DATA.txt                        |   35 +
 groups/haswell/TLB_INSTR.txt                       |   28 +
 groups/ivybridge/ENERGY.txt                        |    4 +
 groups/ivybridge/FLOPS_AVX.txt                     |    4 +-
 groups/ivybridge/FLOPS_DP.txt                      |    2 +-
 groups/ivybridge/ICACHE.txt                        |   25 +
 .../{haswell/L2CACHE.txt => ivybridge/L3CACHE.txt} |   24 +-
 groups/ivybridge/MEM.txt                           |   20 +-
 groups/ivybridge/MEM_DP.txt                        |   24 +-
 groups/ivybridge/MEM_SP.txt                        |   22 +-
 groups/ivybridge/TLB.txt                           |   23 -
 groups/ivybridge/TLB_DATA.txt                      |   35 +
 groups/ivybridge/TLB_INSTR.txt                     |   28 +
 groups/sandybridge/ENERGY.txt                      |    6 +-
 groups/sandybridge/FLOPS_DP.txt                    |    2 +-
 .../L2CACHE.txt => sandybridge/L3CACHE.txt}        |   24 +-
 groups/sandybridge/MEM.txt                         |   20 +-
 groups/sandybridge/MEM_DP.txt                      |   22 +-
 groups/sandybridge/MEM_SP.txt                      |   22 +-
 groups/sandybridge/TLB.txt                         |   23 -
 groups/sandybridge/TLB_DATA.txt                    |   35 +
 groups/sandybridge/TLB_INSTR.txt                   |   28 +
 groups/{core2 => silvermont}/BRANCH.txt            |   17 +-
 groups/{haswell => silvermont}/ENERGY.txt          |    4 +-
 groups/silvermont/ICACHE.txt                       |   25 +
 groups/silvermont/L1TOL2.txt                       |   28 +
 groups/silvermont/L2TOMEM.txt                      |   26 +
 kernel/Makefile                                    |   12 +
 kernel/enable_rdpmc.c                              |   73 ++
 make/include_GCC.mk                                |    2 +-
 make/include_ICC.mk                                |    6 +-
 perl/generatePas.pl                                |    2 +-
 perl/likwid-mpirun                                 |   36 +-
 perl/likwid-setFrequencies                         |   41 +-
 perl/set_license.pl                                |    4 +-
 perl/templates/group.tt                            |   57 +-
 src/access-daemon/Makefile                         |    9 +-
 src/access-daemon/accessDaemon.c                   |  345 ++++--
 src/access-daemon/setFreq.c                        |  100 +-
 src/access-daemon/setFreq.c.tmp                    |    0
 src/accessClient.c                                 |   46 +-
 src/affinity.c                                     |  179 +--
 src/allocator.c                                    |   43 +-
 src/applications/likwid-bench.c                    |  353 +++---
 src/applications/likwid-features.c                 |   48 +-
 src/applications/likwid-genCfg.c                   |   32 +-
 src/applications/likwid-memsweeper.c               |   43 +-
 src/applications/likwid-perfctr.c                  |  166 ++-
 src/applications/likwid-pin.c                      |  150 ++-
 src/applications/likwid-powermeter.c               |  291 +++--
 src/applications/likwid-topology.c                 |   86 +-
 src/asciiBoxes.c                                   |    7 +-
 src/asciiTable.c                                   |    4 +-
 src/barrier.c                                      |    8 +-
 src/bench.c                                        |  122 +-
 src/bitUtil.c                                      |    4 +-
 src/cpuFeatures.c                                  |  274 ++--
 src/cpuid.c                                        |  108 +-
 src/daemon.c                                       |   94 +-
 src/ghash.c                                        |    1 -
 src/hashTable.c                                    |   10 +-
 src/includes/accessClient.h                        |    4 +-
 src/includes/accessClient_types.h                  |    4 +-
 src/includes/affinity.h                            |    6 +-
 src/includes/affinity_types.h                      |    6 +-
 src/includes/allocator.h                           |   17 +-
 src/includes/asciiBoxes.h                          |    6 +-
 src/includes/asciiBoxes_types.h                    |    4 +-
 src/includes/asciiTable.h                          |    4 +-
 src/includes/asciiTable_types.h                    |    4 +-
 src/includes/barrier.h                             |    4 +-
 src/includes/barrier_types.h                       |    4 +-
 src/includes/bitUtil.h                             |    4 +-
 src/includes/cpuFeatures.h                         |    4 +-
 src/includes/cpuFeatures_types.h                   |   38 +-
 src/includes/cpuid.h                               |   11 +-
 src/includes/cpuid_types.h                         |    5 +-
 src/includes/daemon.h                              |    6 +-
 src/includes/error.h                               |    4 +-
 src/includes/ghash.h                               |   14 +-
 src/includes/hashTable.h                           |    4 +-
 src/includes/libperfctr_types.h                    |    4 +-
 src/includes/likwid.h                              |    4 +-
 src/includes/lock.h                                |   56 +-
 src/includes/memsweep.h                            |   10 +-
 src/includes/msr.h                                 |    4 +-
 src/includes/multiplex.h                           |    4 +-
 src/includes/multiplex_types.h                     |    4 +-
 src/includes/numa.h                                |    4 +-
 src/includes/numa_types.h                          |   10 +-
 src/includes/pci.h                                 |    4 +-
 src/includes/pci_types.h                           |    4 +-
 src/includes/perfmon.h                             |   49 +-
 src/includes/perfmon_atom.h                        |    4 +-
 src/includes/perfmon_atom_events.txt               |    4 +-
 src/includes/perfmon_core2.h                       |   13 +-
 src/includes/perfmon_core2_counters.h              |   13 +-
 src/includes/perfmon_core2_events.txt              |    9 +-
 src/includes/perfmon_haswell.h                     |  156 ++-
 src/includes/perfmon_haswell_counters.h            |   13 +-
 src/includes/perfmon_haswell_events.txt            |  148 +--
 src/includes/perfmon_interlagos.h                  |   14 +-
 src/includes/perfmon_interlagos_counters.h         |    4 +-
 src/includes/perfmon_interlagos_events.txt         |    4 +-
 src/includes/perfmon_ivybridge.h                   |   84 +-
 src/includes/perfmon_ivybridge_counters.h          |  103 +-
 src/includes/perfmon_ivybridge_events.txt          |  210 +---
 src/includes/perfmon_k10.h                         |   13 +-
 src/includes/perfmon_k10_counters.h                |    4 +-
 src/includes/perfmon_k10_events.txt                |    4 +-
 src/includes/perfmon_k8.h                          |    4 +-
 src/includes/perfmon_k8_events.txt                 |    4 +-
 src/includes/perfmon_kabini.h                      |   18 +-
 src/includes/perfmon_kabini_counters.h             |    4 +-
 src/includes/perfmon_kabini_events.txt             |    4 +-
 src/includes/perfmon_nehalem.h                     |   44 +-
 src/includes/perfmon_nehalemEX.h                   |  602 ++++++++-
 src/includes/perfmon_nehalemEX_events.txt          |  619 +++++++++-
 src/includes/perfmon_nehalem_counters.h            |    4 +-
 src/includes/perfmon_nehalem_events.txt            |    4 +-
 src/includes/perfmon_p6_events.txt                 |    4 +-
 src/includes/perfmon_phi.h                         |   11 +-
 src/includes/perfmon_phi_counters.h                |    4 +-
 src/includes/perfmon_phi_events.txt                |    4 +-
 src/includes/perfmon_pm.h                          |   17 +-
 src/includes/perfmon_pm_counters.h                 |    4 +-
 src/includes/perfmon_pm_events.txt                 |    4 +-
 src/includes/perfmon_sandybridge.h                 |   47 +-
 src/includes/perfmon_sandybridge_counters.h        |   93 +-
 src/includes/perfmon_sandybridge_events.txt        |  695 +----------
 .../{perfmon_haswell.h => perfmon_silvermont.h}    |  161 ++-
 ...ll_counters.h => perfmon_silvermont_counters.h} |   26 +-
 src/includes/perfmon_silvermont_events.txt         |  440 +++++++
 src/includes/perfmon_types.h                       |   44 +-
 src/includes/perfmon_westmere.h                    |    4 +-
 src/includes/perfmon_westmereEX.h                  |  522 +++++---
 src/includes/perfmon_westmereEX_counters.h         |   72 +-
 src/includes/perfmon_westmereEX_events.txt         |  637 +++++++++-
 src/includes/perfmon_westmere_events.txt           |    4 +-
 src/includes/power.h                               |    4 +-
 src/includes/power_types.h                         |    4 +-
 src/includes/registers.h                           |   84 +-
 src/includes/strUtil.h                             |    6 +-
 src/includes/strUtil_types.h                       |    4 +-
 src/includes/test_types.h                          |   22 +-
 src/includes/textcolor.h                           |    4 +-
 src/includes/thermal.h                             |    9 +-
 src/includes/thermal_types.h                       |    5 +-
 src/includes/threads.h                             |    7 +-
 src/includes/threads_types.h                       |   26 +-
 src/includes/timer.h                               |   70 +-
 src/includes/timer_types.h                         |    4 +-
 src/includes/tree.h                                |    4 +-
 src/includes/tree_types.h                          |    4 +-
 src/includes/types.h                               |    4 +-
 src/libperfctr.c                                   |   79 +-
 src/likwid.f90                                     |   28 +-
 src/likwid_f90_interface.c                         |    8 +-
 src/memsweep.c                                     |   40 +-
 src/msr.c                                          |  168 ++-
 src/multiplex.c                                    |    4 +-
 src/numa.c                                         |   38 +-
 src/pci.c                                          |  159 ++-
 src/perfmon.c                                      |  255 +++-
 src/power.c                                        |   54 +-
 src/pthread-overload/Makefile                      |    4 +-
 src/pthread-overload/pthread-overload.c            |    8 +-
 src/strUtil.c                                      | 1305 ++++++++++----------
 src/thermal.c                                      |    5 +-
 src/threads.c                                      |   13 +-
 src/timer.c                                        |   28 +-
 src/tree.c                                         |   44 +-
 test/accuracy/Makefile                             |   25 +
 test/accuracy/README                               |   18 +
 test/accuracy/likwid-accuracy.py                   |  533 ++++++++
 test/accuracy/likwid-tester                        |    4 +-
 test/accuracy/statistics.py                        |  643 ++++++++++
 test/executable_tests/Makefile                     |   22 +
 test/executable_tests/README                       |    8 +
 test/executable_tests/likwid-bench.txt             |   29 +
 test/executable_tests/likwid-features.txt          |    9 +
 test/executable_tests/likwid-genCfg.txt            |    5 +
 test/executable_tests/likwid-memsweeper.txt        |    8 +
 test/executable_tests/likwid-perfctr.txt           |   38 +
 test/executable_tests/likwid-pin.txt               |   26 +
 test/executable_tests/likwid-powermeter.txt        |   14 +
 test/executable_tests/likwid-setFreq.txt           |    6 +
 test/executable_tests/likwid-topology.txt          |   11 +
 test/executable_tests/tester.sh                    |   80 ++
 260 files changed, 10026 insertions(+), 4091 deletions(-)

diff --git a/INSTALL b/INSTALL
index 4742591..5939aa9 100644
--- a/INSTALL
+++ b/INSTALL
@@ -20,7 +20,7 @@ the WIKI. On 32bit systems you have to pick the GCCX86 compiler target.
 All generated files are located in the [GCC|ICC|GCCX86] build directory.
 This includes the dependency files, object files and also the
 generated source files and the pas and assembly files for likwid-bench.
-If you debug your likwid-bench benchmarks you can look at all 
+If you debug your likwid-bench benchmarks you can look at all
 intermediate build files and also the final assembly code.
 
 == Known problems ==
@@ -60,7 +60,7 @@ Check if msr device files are there with 'ls /dev/cpu/0/'. If msr device files a
 consult your distros documentation how to do so.
 
 Once you have the msr device files avilable:
-3. Adopt access rights on the msr device files for normal user. To allow everybody access you can 
+3. Adopt access rights on the msr device files for normal user. To allow everybody access you can
 use 'chmod o+rw /dev/cpu/*/msr' . This is only recommended on save single user desktop systems.
 
 As a general access to the msr registers is not desired on security sensitive
@@ -79,7 +79,11 @@ page:
 http://code.google.com/p/likwid/wiki/MSRDaemon
 
 A common solution to give access is to use the likwid-accessD and make it suid root.
-You need to carry out the following steps:
+Starting with version 3.1.3 make install will do those steps. Of course this will only
+work as long as you are root while calling make install.
+
+If you are not root and someone else needs to install the daemon, the
+following steps need to be carried out:
 
 1. Go to the directory where you installed the likwid tools.
 2. Change to the sbin directory there.
diff --git a/Makefile b/Makefile
index de85f13..eecd4e9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,12 @@
 # =======================================================================================
-#  
+#
 #      Filename:  Makefile
-# 
+#
 #      Description:  Central Makefile
-# 
-#      Version:   3.1.2
-#      Released:  2.6.2014
-# 
+#
+#      Version:   3.1.3
+#      Released:  4.11.2014
+#
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
 #
@@ -87,11 +87,14 @@ endif
 
 LIKWID_LIB = liblikwid
 ifeq ($(SHARED_LIBRARY),true)
-CFLAGS += $(SHARED_CFLAGS)
+CFLAGS += $(SHARED_CFLAGS) -ggdb
 DYNAMIC_TARGET_LIB := $(LIKWID_LIB).so
+TARGET_LIB := $(DYNAMIC_TARGET_LIB)
 LIBS += -L. -llikwid
+SHARED_LFLAGS += -lm -lpthread
 else
 STATIC_TARGET_LIB := $(LIKWID_LIB).a
+TARGET_LIB := $(STATIC_TARGET_LIB)
 endif
 
 ifneq ($(COLOR),NONE)
@@ -155,16 +158,17 @@ PERFMONHEADERS  = $(patsubst $(SRC_DIR)/includes/%.txt, $(BUILD_DIR)/%.h,$(wildc
 OBJ_BENCH  =  $(patsubst $(BENCH_DIR)/%.ptt, $(BUILD_DIR)/%.o,$(wildcard $(BENCH_DIR)/*.ptt))
 
 APPS      = likwid-perfctr    \
-		likwid-features   \
-		likwid-powermeter \
-		likwid-memsweeper \
-		likwid-topology   \
-		likwid-genCfg     \
-		likwid-pin        \
-		likwid-bench
-
-PERL_APPS	= likwid-mpirun \
-			likwid-perfscope
+            likwid-features   \
+            likwid-powermeter \
+            likwid-memsweeper \
+            likwid-topology   \
+            likwid-genCfg     \
+            likwid-pin        \
+            likwid-bench
+
+PERL_APPS = likwid-mpirun         \
+            likwid-setFrequencies \
+            likwid-perfscope
 
 DAEMON_APPS = $(SETFREQ_TARGET) \
 			$(DAEMON_TARGET)
@@ -185,7 +189,7 @@ FORTRAN_INTERFACE =
 FORTRAN_INSTALL =
 endif
 
-all: $(BUILD_DIR) $(GENGROUPLOCK) $(PERFMONHEADERS) $(OBJ) $(OBJ_BENCH) $(STATIC_TARGET_LIB) $(DYNAMIC_TARGET_LIB) $(APPS) $(FORTRAN_INTERFACE)  $(PINLIB)  $(DAEMON_TARGET) $(SETFREQ_TARGET)
+all: $(BUILD_DIR) $(GENGROUPLOCK) $(PERFMONHEADERS) $(OBJ) $(OBJ_BENCH) $(TARGET_LIB) $(APPS) $(FORTRAN_INTERFACE)  $(PINLIB)  $(DAEMON_TARGET) $(SETFREQ_TARGET)
 
 tags:
 	@echo "===>  GENERATE  TAGS"
@@ -197,11 +201,11 @@ $(APPS):  $(addprefix $(SRC_DIR)/applications/,$(addsuffix  .c,$(APPS))) $(BUILD
 
 $(STATIC_TARGET_LIB): $(OBJ)
 	@echo "===>  CREATE STATIC LIB  $(STATIC_TARGET_LIB)"
-	$(Q)${AR} -cq $(STATIC_TARGET_LIB) $(OBJ)
+	$(Q)${AR} -crus $(STATIC_TARGET_LIB) $(OBJ)
 
 $(DYNAMIC_TARGET_LIB): $(OBJ)
 	@echo "===>  CREATE SHARED LIB  $(DYNAMIC_TARGET_LIB)"
-	$(Q)${CC} $(SHARED_LFLAGS) $(SHARED_CFLAGS) -o $(DYNAMIC_TARGET_LIB) $(OBJ) -lm
+	$(Q)${CC} $(SHARED_CFLAGS) -o $(DYNAMIC_TARGET_LIB) $(OBJ) -lm $(SHARED_LFLAGS)
 
 $(DAEMON_TARGET): $(SRC_DIR)/access-daemon/accessDaemon.c
 	@echo "===>  Build access daemon $(DAEMON_TARGET)"
@@ -216,7 +220,7 @@ $(BUILD_DIR):
 
 $(PINLIB):
 	@echo "===>  CREATE LIB  $(PINLIB)"
-	$(Q)$(MAKE) -s -C src/pthread-overload/ $(PINLIB) 
+	$(Q)$(MAKE) -s -C src/pthread-overload/ $(PINLIB)
 
 $(GENGROUPLOCK): $(foreach directory,$(shell ls $(GROUP_DIR)), $(wildcard $(GROUP_DIR)/$(directory)/*.txt))
 	@echo "===>  GENERATE GROUP HEADERS"
@@ -288,17 +292,21 @@ install:
 		cp -f $$app $(PREFIX)/bin; \
 	done
 	@cp -f perl/feedGnuplot  $(PREFIX)/bin
-	@sed -e "s+<PREFIX>+$(PREFIX)+g" perl/likwid-setFrequencies > $(PREFIX)/bin/likwid-setFrequencies
 	@for app in $(PERL_APPS); do \
-		cp -f perl/$$app $(PREFIX)/bin; \
+		sed -e "s+<PREFIX>+$(PREFIX)+g" perl/$$app > $(PREFIX)/bin/$$app; \
 	done
 	@chmod 755 $(PREFIX)/bin/likwid-*
-	@echo "===> INSTALL daemon applications to $(PREFIX)/bin"
+	@echo "===> INSTALL daemon applications to $(PREFIX)/sbin"
 	@mkdir -p $(PREFIX)/sbin
 	@for app in $(DAEMON_APPS); do \
 		cp -f $$app $(PREFIX)/sbin; \
+		if [ $(shell id -u) = "0" ]; then \
+			chown root $(PREFIX)/sbin/$$app; \
+			chmod 4775 $(PREFIX)/sbin/$$app; \
+		else \
+			echo "Only root can adjust the privileges of the daemon applications in $(PREFIX)/sbin"; \
+		fi; \
 	done
-	@chmod 755 $(PREFIX)/sbin/likwid-*
 	@echo "===> INSTALL man pages to $(MANPREFIX)/man1"
 	@mkdir -p $(MANPREFIX)/man1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-topology.1 > $(MANPREFIX)/man1/likwid-topology.1
@@ -308,10 +316,19 @@ install:
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-pin.1 > $(MANPREFIX)/man1/likwid-pin.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFrequencies.1 > $(MANPREFIX)/man1/likwid-setFrequencies.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/feedGnuplot.1 > $(MANPREFIX)/man1/feedGnuplot.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-accessD.1 > $(MANPREFIX)/man1/likwid-accessD.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-genCfg.1 > $(MANPREFIX)/man1/likwid-genCfg.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-memsweeper.1 > $(MANPREFIX)/man1/likwid-memsweeper.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-mpirun.1 > $(MANPREFIX)/man1/likwid-mpirun.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-perfscope.1 > $(MANPREFIX)/man1/likwid-perfscope.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFreq.1 > $(MANPREFIX)/man1/likwid-setFreq.1
 	@chmod 644 $(MANPREFIX)/man1/likwid-*
 	@echo "===> INSTALL headers to $(PREFIX)/include"
-	@mkdir -p $(PREFIX)/include
+	@mkdir -p $(PREFIX)/include/likwid
 	@cp -f src/includes/likwid*.h  $(PREFIX)/include/
+	@cp -f src/includes/*  $(PREFIX)/include/likwid
+	@cp -f GCC/perfmon_group_types.h  $(PREFIX)/include/likwid
 	$(FORTRAN_INSTALL)
 	@echo "===> INSTALL libraries to $(PREFIX)/lib"
 	@mkdir -p $(PREFIX)/lib
@@ -321,23 +338,22 @@ install:
 	@mkdir -p $(LIKWIDFILTERPATH)
 	@cp -f filters/*  $(LIKWIDFILTERPATH)
 	@chmod 755 $(LIKWIDFILTERPATH)/*
-	@echo 
-	@echo "Please set suitable permissions and capabilities\nfor the daemon applications in $(PREFIX)/sbin"
 
 uninstall:
 	@echo "===> REMOVING applications from $(PREFIX)/bin"
-	@rm -f $(addprefix $(PREFIX)/bin/,$(APPS)) 
+	@rm -f $(addprefix $(PREFIX)/bin/,$(APPS))
 	@rm -f $(addprefix $(PREFIX)/bin/,$(PERL_APPS))
-	@rm -f $(PREFIX)/bin/likwid-setFrequencies
 	@rm -f $(PREFIX)/bin/feedGnuplot
 	@echo "===> REMOVING daemon applications from $(PREFIX)/sbin"
-	@rm -f $(addprefix $(PREFIX)/sbin/,$(DAEMON_APPS)) 
+	@rm -f $(addprefix $(PREFIX)/sbin/,$(DAEMON_APPS))
 	@echo "===> REMOVING man pages from $(MANPREFIX)/man1"
-	@rm -f $(addprefix $(MANPREFIX)/man1/,$(addsuffix  .1,$(APPS)))
+	@rm -f $(MANPREFIX)/man1/likwid-*
+	@rm -f $(MANPREFIX)/man1/feedGnuplot.1
 	@echo "===> REMOVING headers from $(PREFIX)/include"
 	@rm -f $(PREFIX)/include/likwid*.h
+	@rm -rf $(PREFIX)/include/likwid
 	@echo "===> REMOVING libs from $(PREFIX)/lib"
-	@rm -f $(PREFIX)/lib/$(LIKWID_LIB)* 
+	@rm -f $(PREFIX)/lib/$(LIKWID_LIB)*
 	@echo "===> REMOVING filter from $(PREFIX)/share"
 	@rm -rf  $(PREFIX)/share/likwid
 
diff --git a/README b/README
index 7e3e466..f47ac01 100644
--- a/README
+++ b/README
@@ -21,7 +21,7 @@ likwid wiki pages at:
 
 http://code.google.com/p/likwid/wiki/Introduction
 
-If you have problems or suggestions please let me know on the likwid mailing list:
+If you have problems or suggestions please let us know on the likwid mailing list:
 
 http://groups.google.com/group/likwid-users
 
diff --git a/bench/x86-64/branch.ptt b/bench/x86-64/branch.ptt
new file mode 100644
index 0000000..e15086d
--- /dev/null
+++ b/bench/x86-64/branch.ptt
@@ -0,0 +1,36 @@
+STREAMS 4
+TYPE DOUBLE_RAND
+FLOPS 2
+BYTES 32
+LOOP 8
+movaps    FPR1, [STR1 + GPR1*8]
+movaps    FPR2, [STR1 + GPR1*8+16]
+movaps    FPR3, [STR1 + GPR1*8+32]
+movaps    FPR4, [STR1 + GPR1*8+48]
+cvtsd2si  GPR2, FPR1
+cmp		  GPR2, 0
+jl sub
+mulpd     FPR1, [STR2 + GPR1*8]
+addpd     FPR1, [STR3 + GPR1*8]
+mulpd     FPR2, [STR2 + GPR1*8+16]
+addpd     FPR2, [STR3 + GPR1*8+16]
+mulpd     FPR3, [STR2 + GPR1*8+32]
+addpd     FPR3, [STR3 + GPR1*8+32]
+mulpd     FPR4, [STR2 + GPR1*8+48]
+addpd     FPR4, [STR3 + GPR1*8+48]
+jmp end
+sub:
+mulpd     FPR1, [STR2 + GPR1*8]
+subpd     FPR1, [STR3 + GPR1*8]
+mulpd     FPR2, [STR2 + GPR1*8+16]
+subpd     FPR2, [STR3 + GPR1*8+16]
+mulpd     FPR3, [STR2 + GPR1*8+32]
+subpd     FPR3, [STR3 + GPR1*8+32]
+mulpd     FPR4, [STR2 + GPR1*8+48]
+subpd     FPR4, [STR3 + GPR1*8+48]
+end:
+movaps    [STR0 + GPR1*8], FPR1
+movaps    [STR0 + GPR1*8+16], FPR2
+movaps    [STR0 + GPR1*8+32], FPR3
+movaps    [STR0 + GPR1*8+48], FPR4
+
diff --git a/bench/x86-64/copy_avx.ptt b/bench/x86-64/copy_avx.ptt
new file mode 100644
index 0000000..814bb78
--- /dev/null
+++ b/bench/x86-64/copy_avx.ptt
@@ -0,0 +1,15 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 0
+BYTES 16
+LOOP 16
+vmovaps    ymm1, [STR0 + GPR1 * 8]
+vmovaps    ymm2, [STR0 + GPR1 * 8 + 32]
+vmovaps    ymm3, [STR0 + GPR1 * 8 + 64]
+vmovaps    ymm4, [STR0 + GPR1 * 8 + 96]
+vmovaps    [STR1 + GPR1 * 8]     , ymm1
+vmovaps    [STR1 + GPR1 * 8 + 32], ymm2
+vmovaps    [STR1 + GPR1 * 8 + 64], ymm3
+vmovaps    [STR1 + GPR1 * 8 + 96], ymm4
+
+
diff --git a/bench/x86-64/copy_mem_avx.ptt b/bench/x86-64/copy_mem_avx.ptt
new file mode 100644
index 0000000..651a55e
--- /dev/null
+++ b/bench/x86-64/copy_mem_avx.ptt
@@ -0,0 +1,14 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 0
+BYTES 16
+LOOP 32
+vmovaps    ymm1, [STR0 + GPR1 * 8]
+vmovaps    ymm2, [STR0 + GPR1 * 8 + 32]
+vmovaps    ymm3, [STR0 + GPR1 * 8 + 64]
+vmovaps    ymm4, [STR0 + GPR1 * 8 + 96]
+vmovntps   [STR1 + GPR1 * 8]     , ymm1
+vmovntps   [STR1 + GPR1 * 8 + 32], ymm2
+vmovntps   [STR1 + GPR1 * 8 + 64], ymm3
+vmovntps   [STR1 + GPR1 * 8 + 96], ymm4
+
diff --git a/bench/x86-64/copy_mem_sse.ptt b/bench/x86-64/copy_mem_sse.ptt
new file mode 100644
index 0000000..f803bce
--- /dev/null
+++ b/bench/x86-64/copy_mem_sse.ptt
@@ -0,0 +1,15 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 0
+BYTES 16
+LOOP 8
+movaps    FPR1, [STR0 + GPR1 * 8]
+movaps    FPR2, [STR0 + GPR1 * 8 + 16]
+movaps    FPR3, [STR0 + GPR1 * 8 + 32]
+movaps    FPR4, [STR0 + GPR1 * 8 + 48]
+movntps   [STR1 + GPR1 * 8]     , FPR1
+movntps   [STR1 + GPR1 * 8 + 16], FPR2
+movntps   [STR1 + GPR1 * 8 + 32], FPR3
+movntps   [STR1 + GPR1 * 8 + 48], FPR4
+
+
diff --git a/bench/x86-64/copy_plain.ptt b/bench/x86-64/copy_plain.ptt
new file mode 100644
index 0000000..4fcbbbc
--- /dev/null
+++ b/bench/x86-64/copy_plain.ptt
@@ -0,0 +1,16 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 0
+BYTES 16
+LOOP 4
+movsd    FPR1, [STR0 + GPR1 * 8]
+movsd    FPR2, [STR0 + GPR1 * 8 + 8]
+movsd    FPR3, [STR0 + GPR1 * 8 + 16]
+movsd    FPR4, [STR0 + GPR1 * 8 + 24]
+movsd    [STR1 + GPR1 * 8]     , FPR1
+movsd    [STR1 + GPR1 * 8 + 8] , FPR2
+movsd    [STR1 + GPR1 * 8 + 16], FPR3
+movsd    [STR1 + GPR1 * 8 + 24], FPR4
+
+
+
diff --git a/bench/x86-64/copy_sse.ptt b/bench/x86-64/copy_sse.ptt
new file mode 100644
index 0000000..ffca4f5
--- /dev/null
+++ b/bench/x86-64/copy_sse.ptt
@@ -0,0 +1,15 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 0
+BYTES 16
+LOOP 8
+movaps    FPR1, [STR0 + GPR1 * 8]
+movaps    FPR2, [STR0 + GPR1 * 8 + 16]
+movaps    FPR3, [STR0 + GPR1 * 8 + 32]
+movaps    FPR4, [STR0 + GPR1 * 8 + 48]
+movaps    [STR1 + GPR1 * 8]     , FPR1
+movaps    [STR1 + GPR1 * 8 + 16], FPR2
+movaps    [STR1 + GPR1 * 8 + 32], FPR3
+movaps    [STR1 + GPR1 * 8 + 48], FPR4
+
+
diff --git a/bench/x86-64/load_avx.ptt b/bench/x86-64/load_avx.ptt
new file mode 100644
index 0000000..93b45c7
--- /dev/null
+++ b/bench/x86-64/load_avx.ptt
@@ -0,0 +1,12 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+LOOP 16
+mov        GPR12, [STR0 + GPR1 * 8 + 256]
+vmovaps    ymm1, [STR0 + GPR1 * 8]
+vmovaps    ymm2, [STR0 + GPR1 * 8 + 32]
+vmovaps    ymm3, [STR0 + GPR1 * 8 + 64]
+vmovaps    ymm4, [STR0 + GPR1 * 8 + 96]
+
+
diff --git a/bench/x86-64/load_plain.ptt b/bench/x86-64/load_plain.ptt
new file mode 100644
index 0000000..be6d21c
--- /dev/null
+++ b/bench/x86-64/load_plain.ptt
@@ -0,0 +1,12 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+LOOP 4
+mov       GPR12, [STR0 + GPR1 * 8 + 256]
+movsd    FPR1, [STR0 + GPR1 * 8]
+movsd    FPR2, [STR0 + GPR1 * 8 + 8]
+movsd    FPR3, [STR0 + GPR1 * 8 + 16]
+movsd    FPR4, [STR0 + GPR1 * 8 + 24]
+
+
diff --git a/bench/x86-64/load_sse.ptt b/bench/x86-64/load_sse.ptt
new file mode 100644
index 0000000..36aaab1
--- /dev/null
+++ b/bench/x86-64/load_sse.ptt
@@ -0,0 +1,12 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+LOOP 8
+mov       GPR12, [STR0 + GPR1 * 8 + 256]
+movaps    FPR1, [STR0 + GPR1 * 8]
+movaps    FPR2, [STR0 + GPR1 * 8 + 16]
+movaps    FPR3, [STR0 + GPR1 * 8 + 32]
+movaps    FPR4, [STR0 + GPR1 * 8 + 48]
+
+
diff --git a/bench/x86-64/peak_avx.ptt b/bench/x86-64/peak_avx.ptt
new file mode 100644
index 0000000..047178e
--- /dev/null
+++ b/bench/x86-64/peak_avx.ptt
@@ -0,0 +1,49 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+INC 16
+vmovaps ymm1, [SCALAR]
+sub  GPR2, 8
+sub  STR0, 64
+sub  STR1, 64
+mov   GPR1, GPR2
+neg   GPR1
+.align 32
+1:
+vmovaps    ymm2, [STR0 + GPR1 * 8 ]
+vaddpd     ymm2, ymm2, ymm1
+vmulpd     ymm2, ymm2, ymm1
+vmovaps    ymm6, [STR0 + GPR1 * 8 ]
+vaddpd     ymm2, ymm2, ymm1
+vmulpd     ymm2, ymm2, ymm1
+#vpshufd    ymm2, ymm1, 0x1
+vmovaps    [STR1 + GPR1 * 8], ymm2
+vmovaps    ymm3, [STR0 + GPR1 * 8 + 32]
+vaddpd     ymm3, ymm3, ymm1
+vmulpd     ymm3, ymm3, ymm1
+vmovaps    ymm7, [STR0 + GPR1 * 8 + 32 ]
+vaddpd     ymm3, ymm3, ymm1
+vmulpd     ymm3, ymm3, ymm1
+#vpshufd    ymm3, ymm1, 0x1
+vmovaps    [STR1 + GPR1 * 8 + 32], ymm3
+vmovaps    ymm4, [STR0 + GPR1 * 8 + 64]
+vaddpd     ymm4, ymm4, ymm1
+vmulpd     ymm4, ymm4, ymm1
+vmovaps    ymm8, [STR0 + GPR1 * 8 + 64 ]
+vaddpd     ymm4, ymm4, ymm1
+vmulpd     ymm4, ymm4, ymm1
+#vpshufd    ymm4, ymm1, 0x1
+vmovaps    [STR1 + GPR1 * 8 + 32], ymm4
+vmovaps    ymm5, [STR0 + GPR1 * 8 + 96]
+vaddpd     ymm5, ymm5, ymm1
+vmulpd     ymm5, ymm5, ymm1
+vmovaps    ymm9, [STR0 + GPR1 * 8 + 96]
+vaddpd     ymm5, ymm5, ymm1
+vmulpd     ymm5, ymm5, ymm1
+#vpshufd    ymm5, ymm1, 0x1
+vmovaps    [STR1 + GPR1 * 8 + 96], ymm5
+add GPR1, 16
+js 1b
+
+
diff --git a/bench/x86-64/peak_sse.ptt b/bench/x86-64/peak_sse.ptt
new file mode 100644
index 0000000..c03e2c8
--- /dev/null
+++ b/bench/x86-64/peak_sse.ptt
@@ -0,0 +1,49 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+INC 8
+movaps FPR1, [SCALAR]
+sub  GPR2, 4
+sub  STR0, 32
+sub  STR1, 32
+mov   GPR1, GPR2
+neg   GPR1
+# Each loop pass loads four 16-byte vectors from STR0 and runs add/mul/add/mul with FPR1 on each.
+# pshufd then overwrites every result and the stores to STR1 are commented out, so nothing is written back.
+.align 16
+1:
+movaps    FPR2, [STR0 + GPR1 * 8 ]
+addpd     FPR2, FPR1
+mulpd     FPR2, FPR1
+movaps    FPR6, [STR0 + GPR1 * 8 ]
+addpd     FPR2, FPR1
+mulpd     FPR2, FPR1
+pshufd    FPR2, FPR1, 0x1
+#movaps    [STR1 + GPR1 * 8], FPR2
+movaps    FPR3, [STR0 + GPR1 * 8 + 16]
+addpd     FPR3, FPR1
+mulpd     FPR3, FPR1
+movaps    FPR7, [STR0 + GPR1 * 8 + 16 ]
+addpd     FPR3, FPR1
+mulpd     FPR3, FPR1
+pshufd    FPR3, FPR1, 0x1
+#movaps    [STR1 + GPR1 * 8 + 16], FPR3
+movaps    FPR4, [STR0 + GPR1 * 8 + 32]
+addpd     FPR4, FPR1
+mulpd     FPR4, FPR1
+movaps    FPR8, [STR0 + GPR1 * 8 + 32 ]
+addpd     FPR4, FPR1
+mulpd     FPR4, FPR1
+pshufd    FPR4, FPR1, 0x1
+#movaps    [STR1 + GPR1 * 8 + 32], FPR4
+movaps    FPR5, [STR0 + GPR1 * 8 + 48]
+addpd     FPR5, FPR1
+mulpd     FPR5, FPR1
+movaps    FPR9, [STR0 + GPR1 * 8 + 48 ]
+addpd     FPR5, FPR1
+mulpd     FPR5, FPR1
+pshufd    FPR5, FPR1, 0x1
+#movaps    [STR1 + GPR1 * 8 + 48], FPR5
+add GPR1, 8
+js 1b
diff --git a/bench/x86-64/peakflops_avx.ptt b/bench/x86-64/peakflops_avx.ptt
new file mode 100644
index 0000000..d9f9885
--- /dev/null
+++ b/bench/x86-64/peakflops_avx.ptt
@@ -0,0 +1,37 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+INC 16
+vmovaps ymm1, [SCALAR]
+sub  GPR2, 8
+sub  STR0, 64
+sub  STR1, 64
+mov   GPR1, GPR2
+neg   GPR1
+# Each loop pass loads four 32-byte vectors from STR0 and applies add/mul/add/mul with ymm1; nothing is stored.
+# GPR1 is advanced mid-body, so the last two loads use offsets -64/-32 and js 1b tests the flags of that add.
+.align 32
+1:
+vmovaps    ymm2, [STR0 + GPR1 * 8 ]
+vaddpd     ymm2, ymm2, ymm1
+vmulpd     ymm2, ymm2, ymm1
+vaddpd     ymm2, ymm2, ymm1
+vmulpd     ymm2, ymm2, ymm1
+vmovaps    ymm3, [STR0 + GPR1 * 8 + 32]
+add GPR1, 16
+vaddpd     ymm3, ymm3, ymm1
+vmulpd     ymm3, ymm3, ymm1
+vaddpd     ymm3, ymm3, ymm1
+vmulpd     ymm3, ymm3, ymm1
+vmovaps    ymm4, [STR0 + GPR1 * 8 - 64]
+vaddpd     ymm4, ymm4, ymm1
+vmulpd     ymm4, ymm4, ymm1
+vaddpd     ymm4, ymm4, ymm1
+vmulpd     ymm4, ymm4, ymm1
+vmovaps    ymm5, [STR0 + GPR1 * 8 - 32]
+vaddpd     ymm5, ymm5, ymm1
+vmulpd     ymm5, ymm5, ymm1
+vaddpd     ymm5, ymm5, ymm1
+vmulpd     ymm5, ymm5, ymm1
+js 1b
diff --git a/bench/x86-64/peakflops_sse.ptt b/bench/x86-64/peakflops_sse.ptt
new file mode 100644
index 0000000..94c769a
--- /dev/null
+++ b/bench/x86-64/peakflops_sse.ptt
@@ -0,0 +1,37 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+INC 8
+movaps FPR1, [SCALAR]
+sub  GPR2, 4
+sub  STR0, 32
+sub  STR1, 32
+mov   GPR1, GPR2
+neg   GPR1
+# Each loop pass loads four 16-byte vectors from STR0 and applies add/mul/add/mul with FPR1; nothing is stored.
+# GPR1 is advanced mid-body, so the last two loads use offsets -32/-16 and js 1b tests the flags of that add.
+.align 32
+1:
+movaps    FPR2, [STR0 + GPR1 * 8 ]
+addpd     FPR2, FPR1
+mulpd     FPR2, FPR1
+addpd     FPR2, FPR1
+mulpd     FPR2, FPR1
+movaps    FPR3, [STR0 + GPR1 * 8 + 16]
+add GPR1, 8
+addpd     FPR3, FPR1
+mulpd     FPR3, FPR1
+addpd     FPR3, FPR1
+mulpd     FPR3, FPR1
+movaps    FPR4, [STR0 + GPR1 * 8 - 32]
+addpd     FPR4, FPR1
+mulpd     FPR4, FPR1
+addpd     FPR4, FPR1
+mulpd     FPR4, FPR1
+movaps    FPR5, [STR0 + GPR1 * 8 - 16]
+addpd     FPR5, FPR1
+mulpd     FPR5, FPR1
+addpd     FPR5, FPR1
+mulpd     FPR5, FPR1
+js 1b
diff --git a/bench/x86-64/store_avx.ptt b/bench/x86-64/store_avx.ptt
new file mode 100644
index 0000000..7b589a8
--- /dev/null
+++ b/bench/x86-64/store_avx.ptt
@@ -0,0 +1,15 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+vmovaps ymm1, [SCALAR]
+vmovaps ymm2, [SCALAR]
+vmovaps ymm3, [SCALAR]
+vmovaps ymm4, [SCALAR]
+LOOP 16
+# Store-only kernel: writes ymm1-ymm4 (each preloaded from SCALAR) to four consecutive 32-byte slots of STR0.
+#mov       GPR14, [STR0 + GPR1 * 8 + 256] 
+vmovaps    [STR0 + GPR1 * 8]     , ymm1
+vmovaps    [STR0 + GPR1 * 8 + 32], ymm2
+vmovaps    [STR0 + GPR1 * 8 + 64], ymm3
+vmovaps    [STR0 + GPR1 * 8 + 96], ymm4
diff --git a/bench/x86-64/store_mem_avx.ptt b/bench/x86-64/store_mem_avx.ptt
new file mode 100644
index 0000000..e023fd0
--- /dev/null
+++ b/bench/x86-64/store_mem_avx.ptt
@@ -0,0 +1,14 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+vmovaps ymm1, [SCALAR]
+vmovaps ymm2, [SCALAR]
+vmovaps ymm3, [SCALAR]
+vmovaps ymm4, [SCALAR]
+LOOP 16
+# Store-only kernel using non-temporal stores (vmovntpd): ymm1-ymm4 go to four consecutive 32-byte slots of STR0.
+vmovntpd    [STR0 + GPR1 * 8]     , ymm1
+vmovntpd    [STR0 + GPR1 * 8 + 32], ymm2
+vmovntpd    [STR0 + GPR1 * 8 + 64], ymm3
+vmovntpd    [STR0 + GPR1 * 8 + 96], ymm4
diff --git a/bench/x86-64/store_mem_sse.ptt b/bench/x86-64/store_mem_sse.ptt
new file mode 100644
index 0000000..0a0222d
--- /dev/null
+++ b/bench/x86-64/store_mem_sse.ptt
@@ -0,0 +1,14 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+movaps FPR1, [SCALAR]
+movaps FPR2, [SCALAR]
+movaps FPR3, [SCALAR]
+movaps FPR4, [SCALAR]
+LOOP 8
+# Store-only kernel using non-temporal stores (movntpd): FPR1-FPR4 go to four consecutive 16-byte slots of STR0.
+movntpd    [STR0 + GPR1 * 8]     , FPR1
+movntpd    [STR0 + GPR1 * 8 + 16], FPR2
+movntpd    [STR0 + GPR1 * 8 + 32], FPR3
+movntpd    [STR0 + GPR1 * 8 + 48], FPR4
diff --git a/bench/x86-64/store_plain.ptt b/bench/x86-64/store_plain.ptt
new file mode 100644
index 0000000..0f667cd
--- /dev/null
+++ b/bench/x86-64/store_plain.ptt
@@ -0,0 +1,15 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+movsd FPR1, [SCALAR]
+movsd FPR2, [SCALAR]
+movsd FPR3, [SCALAR]
+movsd FPR4, [SCALAR]
+LOOP 4
+# Store-only kernel: writes the low 8 bytes of FPR1-FPR4 (each preloaded from SCALAR) to four consecutive doubles of STR0.
+#mov       GPR14, [STR0 + GPR1 * 8 + 256] 
+movsd    [STR0 + GPR1 * 8]     , FPR1
+movsd    [STR0 + GPR1 * 8 + 8], FPR2
+movsd    [STR0 + GPR1 * 8 + 16], FPR3
+movsd    [STR0 + GPR1 * 8 + 24], FPR4
diff --git a/bench/x86-64/store_sse.ptt b/bench/x86-64/store_sse.ptt
new file mode 100644
index 0000000..4ef9ab9
--- /dev/null
+++ b/bench/x86-64/store_sse.ptt
@@ -0,0 +1,15 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+movaps FPR1, [SCALAR]
+movaps FPR2, [SCALAR]
+movaps FPR3, [SCALAR]
+movaps FPR4, [SCALAR]
+LOOP 8
+# Store-only kernel: writes FPR1-FPR4 (each preloaded from SCALAR) to four consecutive 16-byte slots of STR0.
+#mov       GPR14, [STR0 + GPR1 * 8 + 256] 
+movaps    [STR0 + GPR1 * 8]     , FPR1
+movaps    [STR0 + GPR1 * 8 + 16], FPR2
+movaps    [STR0 + GPR1 * 8 + 32], FPR3
+movaps    [STR0 + GPR1 * 8 + 48], FPR4
diff --git a/bench/x86-64/stream_avx.ptt b/bench/x86-64/stream_avx.ptt
new file mode 100644
index 0000000..8fbaf7c
--- /dev/null
+++ b/bench/x86-64/stream_avx.ptt
@@ -0,0 +1,25 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 4
+BYTES 48
+vbroadcastss ymm1, [SCALAR]
+LOOP 16
+# Single-precision triad: STR0 = STR1 * ymm1 + STR2 on four 32-byte vectors per loop pass.
+# 256-bit vmovaps needs 32-byte-aligned addresses, so the vectors are spaced 32 bytes apart.
+# NOTE(review): the header values above and the *8 index scaling are kept from the original; confirm they agree for single precision.
+vmovaps   ymm2, [STR1 + GPR1*8]
+vmovaps   ymm3, [STR1 + GPR1*8+32]
+vmovaps   ymm4, [STR1 + GPR1*8+64]
+vmovaps   ymm5, [STR1 + GPR1*8+96]
+vmulps    ymm2, ymm2, ymm1
+vaddps    ymm2, ymm2, [STR2 + GPR1*8]
+vmulps    ymm3, ymm3, ymm1
+vaddps    ymm3, ymm3, [STR2 + GPR1*8+32]
+vmulps    ymm4, ymm4, ymm1
+vaddps    ymm4, ymm4, [STR2 + GPR1*8+64]
+vmulps    ymm5, ymm5, ymm1
+vaddps    ymm5, ymm5, [STR2 + GPR1*8+96]
+vmovaps   [STR0 + GPR1*8], ymm2
+vmovaps   [STR0 + GPR1*8+32], ymm3
+vmovaps   [STR0 + GPR1*8+64], ymm4
+vmovaps   [STR0 + GPR1*8+96], ymm5
diff --git a/bench/x86-64/striad_avx.ptt b/bench/x86-64/striad_avx.ptt
new file mode 100644
index 0000000..b3c1317
--- /dev/null
+++ b/bench/x86-64/striad_avx.ptt
@@ -0,0 +1,23 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+vmovaps ymm5, [SCALAR]
+LOOP 16
+# Double-precision triad: STR0 = STR1 * ymm5 + STR2 on four 32-byte vectors per loop pass (ymm5 is loaded from SCALAR).
+vmovaps    ymm1, [STR1 + GPR1*8]
+vmovaps    ymm2, [STR1 + GPR1*8+32]
+vmovaps    ymm3, [STR1 + GPR1*8+64]
+vmovaps    ymm4, [STR1 + GPR1*8+96]
+vmulpd     ymm1, ymm1, ymm5
+vaddpd     ymm1, ymm1, [STR2 + GPR1*8]
+vmulpd     ymm2, ymm2, ymm5
+vaddpd     ymm2, ymm2, [STR2 + GPR1*8+32]
+vmulpd     ymm3, ymm3, ymm5
+vaddpd     ymm3, ymm3, [STR2 + GPR1*8+64]
+vmulpd     ymm4, ymm4, ymm5
+vaddpd     ymm4, ymm4, [STR2 + GPR1*8+96]
+vmovaps    [STR0 + GPR1*8]   , ymm1
+vmovaps    [STR0 + GPR1*8+32], ymm2
+vmovaps    [STR0 + GPR1*8+64], ymm3
+vmovaps    [STR0 + GPR1*8+96], ymm4
diff --git a/bench/x86-64/striad_mem_avx.ptt b/bench/x86-64/striad_mem_avx.ptt
new file mode 100644
index 0000000..cef2688
--- /dev/null
+++ b/bench/x86-64/striad_mem_avx.ptt
@@ -0,0 +1,11 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+vmovaps ymm5, [SCALAR]
+LOOP 4
+# Triad with a non-temporal store (vmovntpd): STR0 = STR2 * ymm5 + STR1, one 32-byte vector per loop pass.
+vmovaps    ymm1, [STR2 + GPR1*8]
+vmulpd     ymm1, ymm1, ymm5
+vaddpd     ymm1, ymm1, [STR1 + GPR1*8]
+vmovntpd   [STR0 + GPR1*8], ymm1
diff --git a/bench/x86-64/striad_mem_sse.ptt b/bench/x86-64/striad_mem_sse.ptt
new file mode 100644
index 0000000..b8364cc
--- /dev/null
+++ b/bench/x86-64/striad_mem_sse.ptt
@@ -0,0 +1,11 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+movaps FPR5, [SCALAR]
+LOOP 2
+# Triad with a non-temporal store (movntpd): STR0 = STR2 * FPR5 + STR1, one 16-byte vector per loop pass.
+movaps    FPR1, [STR2 + GPR1*8]
+mulpd     FPR1, FPR5
+addpd     FPR1, [STR1 + GPR1*8]
+movntpd   [STR0 + GPR1*8], FPR1
diff --git a/bench/x86-64/striad_plain.ptt b/bench/x86-64/striad_plain.ptt
new file mode 100644
index 0000000..7b29664
--- /dev/null
+++ b/bench/x86-64/striad_plain.ptt
@@ -0,0 +1,23 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+movsd FPR5, [SCALAR]
+LOOP 4
+# Scalar double triad: STR0[i] = STR1[i] * FPR5 + STR2[i] for four elements per loop pass; FPR5 holds the double at SCALAR.
+movsd    FPR1, [STR1 + GPR1*8]
+movsd    FPR2, [STR1 + GPR1*8+8]
+movsd    FPR3, [STR1 + GPR1*8+16]
+movsd    FPR4, [STR1 + GPR1*8+24]
+mulsd    FPR1, FPR5
+addsd    FPR1, [STR2 + GPR1*8]
+mulsd    FPR2, FPR5
+addsd    FPR2, [STR2 + GPR1*8+8]
+mulsd    FPR3, FPR5
+addsd    FPR3, [STR2 + GPR1*8+16]
+mulsd    FPR4, FPR5
+addsd    FPR4, [STR2 + GPR1*8+24]
+movsd    [STR0 + GPR1*8]   , FPR1
+movsd    [STR0 + GPR1*8+8] , FPR2
+movsd    [STR0 + GPR1*8+16], FPR3
+movsd    [STR0 + GPR1*8+24], FPR4
diff --git a/bench/x86-64/striad_sse.ptt b/bench/x86-64/striad_sse.ptt
new file mode 100644
index 0000000..7c84c3c
--- /dev/null
+++ b/bench/x86-64/striad_sse.ptt
@@ -0,0 +1,23 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+movaps FPR5, [SCALAR]
+LOOP 8
+# Double-precision triad: STR0 = STR1 * FPR5 + STR2 on four 16-byte vectors per loop pass (FPR5 is loaded from SCALAR).
+movaps    FPR1, [STR1 + GPR1*8]
+movaps    FPR2, [STR1 + GPR1*8+16]
+movaps    FPR3, [STR1 + GPR1*8+32]
+movaps    FPR4, [STR1 + GPR1*8+48]
+mulpd     FPR1, FPR5
+addpd     FPR1, [STR2 + GPR1*8]
+mulpd     FPR2, FPR5
+addpd     FPR2, [STR2 + GPR1*8+16]
+mulpd     FPR3, FPR5
+addpd     FPR3, [STR2 + GPR1*8+32]
+mulpd     FPR4, FPR5
+addpd     FPR4, [STR2 + GPR1*8+48]
+movaps    [STR0 + GPR1*8]   , FPR1
+movaps    [STR0 + GPR1*8+16], FPR2
+movaps    [STR0 + GPR1*8+32], FPR3
+movaps    [STR0 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/sum_sse.ptt b/bench/x86-64/sum_sse.ptt
new file mode 100644
index 0000000..3e7a2bb
--- /dev/null
+++ b/bench/x86-64/sum_sse.ptt
@@ -0,0 +1,23 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 1
+BYTES 8
+xorpd FPR1, FPR1
+movapd FPR2, FPR1
+movapd FPR3, FPR1
+movapd FPR4, FPR1
+movapd FPR5, FPR1
+movapd FPR6, FPR1
+movapd FPR7, FPR1
+movapd FPR8, FPR1
+LOOP 16
+# Sum reduction over STR0 with eight independent 16-byte accumulators (FPR1-FPR8, zeroed above via xorpd).
+# The partial sums are left in the registers; this body does not combine or store them.
+addpd    FPR1, [STR0 + GPR1 * 8]
+addpd    FPR2, [STR0 + GPR1 * 8 + 16]
+addpd    FPR3, [STR0 + GPR1 * 8 + 32]
+addpd    FPR4, [STR0 + GPR1 * 8 + 48]
+addpd    FPR5, [STR0 + GPR1 * 8 + 64]
+addpd    FPR6, [STR0 + GPR1 * 8 + 80]
+addpd    FPR7, [STR0 + GPR1 * 8 + 96]
+addpd    FPR8, [STR0 + GPR1 * 8 + 112]
diff --git a/bench/x86-64/triad_avx.ptt b/bench/x86-64/triad_avx.ptt
new file mode 100644
index 0000000..3514cfd
--- /dev/null
+++ b/bench/x86-64/triad_avx.ptt
@@ -0,0 +1,12 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+LOOP 32
+# STR0 = STR1 * STR2 + STR3 on one 32-byte vector per loop pass; GPR1 is used unscaled (NOTE(review): confirm it is a byte offset here).
+vmovapd ymm1, [STR1 + GPR1]
+vmovapd ymm2, [STR2 + GPR1]
+vmovapd ymm3, [STR3 + GPR1]
+vmulpd  ymm0, ymm1, ymm2
+vaddpd  ymm0, ymm0, ymm3
+vmovapd [STR0 + GPR1], ymm0
diff --git a/bench/x86-64/triad_split.ptt b/bench/x86-64/triad_split.ptt
new file mode 100644
index 0000000..7b30e47
--- /dev/null
+++ b/bench/x86-64/triad_split.ptt
@@ -0,0 +1,30 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+LOOP 8
+# STR0 = STR1 * STR2 + STR3 on four 16-byte vectors per loop pass; all operands are loaded into registers before the arithmetic.
+movapd    FPR1, [STR1 + GPR1*8]
+movapd    FPR2, [STR1 + GPR1*8+16]
+movapd    FPR3, [STR1 + GPR1*8+32]
+movapd    FPR4, [STR1 + GPR1*8+48]
+movapd    FPR5, [STR2 + GPR1*8]
+movapd    FPR6, [STR3 + GPR1*8]
+movapd    FPR7, [STR2 + GPR1*8+16]
+movapd    FPR8, [STR3 + GPR1*8+16]
+movapd    FPR9, [STR2 + GPR1*8+32]
+movapd    FPR10, [STR3 + GPR1*8+32]
+movapd    FPR11, [STR2 + GPR1*8+48]
+movapd    FPR12, [STR3 + GPR1*8+48]
+mulpd     FPR1, FPR5
+addpd     FPR1, FPR6
+mulpd     FPR2, FPR7
+addpd     FPR2, FPR8
+mulpd     FPR3, FPR9
+addpd     FPR3, FPR10
+mulpd     FPR4, FPR11
+addpd     FPR4, FPR12
+movapd    [STR0 + GPR1*8], FPR1
+movapd    [STR0 + GPR1*8+16], FPR2
+movapd    [STR0 + GPR1*8+32], FPR3
+movapd    [STR0 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/update_avx.ptt b/bench/x86-64/update_avx.ptt
new file mode 100644
index 0000000..2e9178e
--- /dev/null
+++ b/bench/x86-64/update_avx.ptt
@@ -0,0 +1,15 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 16
+LOOP 16
+# Update kernel: four 32-byte vectors of STR0 are loaded and written back unchanged in each loop pass.
+vmovaps    ymm1, [STR0 + GPR1 * 8]
+vmovaps    [STR0 + GPR1 * 8]     , ymm1
+vmovaps    ymm2, [STR0 + GPR1 * 8 + 32]
+vmovaps    ymm3, [STR0 + GPR1 * 8 + 64]
+vmovaps    ymm4, [STR0 + GPR1 * 8 + 96]
+vmovaps    [STR0 + GPR1 * 8 + 32], ymm2
+vmovaps    [STR0 + GPR1 * 8 + 64], ymm3
+vmovaps    [STR0 + GPR1 * 8 + 96], ymm4
+
diff --git a/bench/x86-64/update_plain.ptt b/bench/x86-64/update_plain.ptt
new file mode 100644
index 0000000..b5a3e4a
--- /dev/null
+++ b/bench/x86-64/update_plain.ptt
@@ -0,0 +1,15 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 16
+LOOP 4
+# Update kernel: four consecutive 8-byte elements of STR0 are loaded and written back unchanged in each loop pass.
+# Each element is moved with movsd so the full 8-byte double is read and written, not just its low 4 bytes.
+movsd    FPR1, [STR0 + GPR1 * 8]
+movsd    [STR0 + GPR1 * 8]     , FPR1
+movsd    FPR2, [STR0 + GPR1 * 8 + 8]
+movsd    FPR3, [STR0 + GPR1 * 8 + 16]
+movsd    FPR4, [STR0 + GPR1 * 8 + 24]
+movsd    [STR0 + GPR1 * 8 + 8], FPR2
+movsd    [STR0 + GPR1 * 8 + 16], FPR3
+movsd    [STR0 + GPR1 * 8 + 24], FPR4
diff --git a/bench/x86-64/update_sse.ptt b/bench/x86-64/update_sse.ptt
new file mode 100644
index 0000000..ac1129b
--- /dev/null
+++ b/bench/x86-64/update_sse.ptt
@@ -0,0 +1,15 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 16
+LOOP 8
+# Update kernel: four 16-byte vectors of STR0 are loaded and written back unchanged in each loop pass.
+movaps    FPR1, [STR0 + GPR1 * 8]
+movaps    [STR0 + GPR1 * 8]     , FPR1
+movaps    FPR2, [STR0 + GPR1 * 8 + 16]
+movaps    FPR3, [STR0 + GPR1 * 8 + 32]
+movaps    FPR4, [STR0 + GPR1 * 8 + 48]
+movaps    [STR0 + GPR1 * 8 + 16], FPR2
+movaps    [STR0 + GPR1 * 8 + 32], FPR3
+movaps    [STR0 + GPR1 * 8 + 48], FPR4
+
diff --git a/bench/x86-64/vtriad_avx.ptt b/bench/x86-64/vtriad_avx.ptt
new file mode 100644
index 0000000..4a542d2
--- /dev/null
+++ b/bench/x86-64/vtriad_avx.ptt
@@ -0,0 +1,22 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+LOOP 16
+# Vector triad: STR0 = STR1 * STR2 + STR3 on four 32-byte vectors per loop pass.
+vmovaps    ymm1, [STR1 + GPR1*8]
+vmovaps    ymm2, [STR1 + GPR1*8+32]
+vmovaps    ymm3, [STR1 + GPR1*8+64]
+vmovaps    ymm4, [STR1 + GPR1*8+96]
+vmulpd    ymm1, ymm1, [STR2 + GPR1*8]
+vaddpd    ymm1, ymm1, [STR3 + GPR1*8]
+vmulpd    ymm2, ymm2, [STR2 + GPR1*8+32]
+vaddpd    ymm2, ymm2, [STR3 + GPR1*8+32]
+vmulpd    ymm3, ymm3, [STR2 + GPR1*8+64]
+vaddpd    ymm3, ymm3, [STR3 + GPR1*8+64]
+vmulpd    ymm4, ymm4, [STR2 + GPR1*8+96]
+vaddpd    ymm4, ymm4, [STR3 + GPR1*8+96]
+vmovaps    [STR0 + GPR1*8], ymm1
+vmovaps    [STR0 + GPR1*8+32], ymm2
+vmovaps    [STR0 + GPR1*8+64], ymm3
+vmovaps    [STR0 + GPR1*8+96], ymm4
diff --git a/bench/x86-64/vtriad_mem_avx.ptt b/bench/x86-64/vtriad_mem_avx.ptt
new file mode 100644
index 0000000..315ef14
--- /dev/null
+++ b/bench/x86-64/vtriad_mem_avx.ptt
@@ -0,0 +1,10 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+LOOP 4
+# Vector triad with a non-temporal store (vmovntpd): STR0 = STR1 * STR2 + STR3, one 32-byte vector per loop pass.
+vmovaps    ymm1, [STR1 + GPR1*8]
+vmulpd     ymm1, ymm1, [STR2 + GPR1*8]
+vaddpd     ymm1, ymm1, [STR3 + GPR1*8]
+vmovntpd   [STR0 + GPR1*8], ymm1
diff --git a/bench/x86-64/vtriad_mem_sse.ptt b/bench/x86-64/vtriad_mem_sse.ptt
new file mode 100644
index 0000000..7c24748
--- /dev/null
+++ b/bench/x86-64/vtriad_mem_sse.ptt
@@ -0,0 +1,10 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+LOOP 2
+# Vector triad with a non-temporal store (movntpd): STR0 = STR1 * STR2 + STR3, one 16-byte vector per loop pass.
+movaps    FPR1, [STR1 + GPR1*8]
+mulpd     FPR1, [STR2 + GPR1*8]
+addpd     FPR1, [STR3 + GPR1*8]
+movntpd   [STR0 + GPR1*8], FPR1
diff --git a/bench/x86-64/vtriad_plain.ptt b/bench/x86-64/vtriad_plain.ptt
new file mode 100644
index 0000000..120331c
--- /dev/null
+++ b/bench/x86-64/vtriad_plain.ptt
@@ -0,0 +1,22 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+LOOP 4
+# Scalar double vector triad: STR0[i] = STR1[i] * STR2[i] + STR3[i] for four elements per loop pass.
+movsd    FPR1, [STR1 + GPR1*8]
+movsd    FPR2, [STR1 + GPR1*8+8]
+movsd    FPR3, [STR1 + GPR1*8+16]
+movsd    FPR4, [STR1 + GPR1*8+24]
+mulsd     FPR1, [STR2 + GPR1*8]
+addsd     FPR1, [STR3 + GPR1*8]
+mulsd     FPR2, [STR2 + GPR1*8+8]
+addsd     FPR2, [STR3 + GPR1*8+8]
+mulsd     FPR3, [STR2 + GPR1*8+16]
+addsd     FPR3, [STR3 + GPR1*8+16]
+mulsd     FPR4, [STR2 + GPR1*8+24]
+addsd     FPR4, [STR3 + GPR1*8+24]
+movsd    [STR0 + GPR1*8], FPR1
+movsd    [STR0 + GPR1*8+8], FPR2
+movsd    [STR0 + GPR1*8+16], FPR3
+movsd    [STR0 + GPR1*8+24], FPR4
diff --git a/bench/x86-64/vtriad_sse.ptt b/bench/x86-64/vtriad_sse.ptt
new file mode 100644
index 0000000..d521aa0
--- /dev/null
+++ b/bench/x86-64/vtriad_sse.ptt
@@ -0,0 +1,22 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+LOOP 8
+# Vector triad: STR0 = STR1 * STR2 + STR3 on four 16-byte vectors per loop pass.
+movaps    FPR1, [STR1 + GPR1*8]
+movaps    FPR2, [STR1 + GPR1*8+16]
+movaps    FPR3, [STR1 + GPR1*8+32]
+movaps    FPR4, [STR1 + GPR1*8+48]
+mulpd     FPR1, [STR2 + GPR1*8]
+addpd     FPR1, [STR3 + GPR1*8]
+mulpd     FPR2, [STR2 + GPR1*8+16]
+addpd     FPR2, [STR3 + GPR1*8+16]
+mulpd     FPR3, [STR2 + GPR1*8+32]
+addpd     FPR3, [STR3 + GPR1*8+32]
+mulpd     FPR4, [STR2 + GPR1*8+48]
+addpd     FPR4, [STR3 + GPR1*8+48]
+movaps    [STR0 + GPR1*8], FPR1
+movaps    [STR0 + GPR1*8+16], FPR2
+movaps    [STR0 + GPR1*8+32], FPR3
+movaps    [STR0 + GPR1*8+48], FPR4
diff --git a/config.mk b/config.mk
index a1caba3..2c3f3be 100644
--- a/config.mk
+++ b/config.mk
@@ -15,7 +15,6 @@ MANPREFIX = $(PREFIX)/man#NO SPACE
 # For the daemon based secure msr/pci access configure
 # the absolute path to the msr daemon executable.
 # Usually you can leave this to the default.
-# $(PREFIX)/sbin/likwid-accessD
 ACCESSDAEMON = $(PREFIX)/sbin/likwid-accessD#NO SPACE
 
 # Set the default mode for MSR access.
diff --git a/doc/feedGnuplot.1 b/doc/feedGnuplot.1
new file mode 100644
index 0000000..3d53986
--- /dev/null
+++ b/doc/feedGnuplot.1
@@ -0,0 +1,190 @@
+.TH feedGnuplot 1 <DATE> likwid\-<VERSION>
+.SH NAME
+feedGnuplot \- General purpose pipe-oriented plotting tool
+.SH SYNOPSIS
+.B feedGnuplot
+.RB [ \-h ]
+.RI [ options ]
+.\" Data to plot is read from standard input; see EXAMPLE.
+
+.SH DESCRIPTION
+.B feedGnuplot
+is a pipe-oriented plotting frontend for GNUplot that can read intermediate results and create a sort of live plot of the data.
+.B feedGnuplot
+is used by
+.B likwid-perfscope(1)
+to print performance counter data printed out by the timeline daemon mode of
+.B likwid-perfctr(1).
+The Perl script
+.B feedGnuplot
+is not written by the LIKWID Authors, it was written by Dima Kogan and published under GPL. The original web page is https://github.com/dkogan/feedgnuplot
+.SH OPTIONS
+.TP
+.B \-h
+prints a help message to standard output, then exits.
+.TP
+.B \-\-[no]domain
+If enabled, the first element of each line is the domain variable.  If not, the point index is used.
+.TP
+.B \-\-[no]dataid
+If enabled, each data point is preceded by the ID of the data set that point corresponds to. This ID is
+interpreted as a string, NOT as just a number. If not enabled, the order of the point is used.
+.TP
+.B \-\-[no]3d
+Do [not] plot in 3D. This only makes sense with 
+.B --domain.
+Each domain here is an (x,y) tuple.
+.TP
+.B \-\-colormap
+Show a colormapped xy plot. Requires extra data for the color. zmin/zmax can be used to set the extents of the colors.
+Automatically increments extraValuesPerPoint.
+.TP
+.B \-\-[no]stream
+Do [not] display the data a point at a time, as it comes in.
+.TP
+.B \-\-[no]lines
+Do [not] draw lines to connect consecutive points.
+.TP
+.B \-\-[no]points
+Do [not] draw points.
+.TP
+.B \-\-circles
+Plot with circles. This requires a radius be specified for each point. Automatically increments extraValuesPerPoint.
+.TP
+.B \-\-xlabel " xxx
+Set x-axis label.
+.TP
+.B \-\-ylabel " xxx
+Set y-axis label.
+.TP
+.B \-\-y2label " xxx
+Set y2-axis label. Does not apply to 3d plots.
+.TP
+.B \-\-zlabel " xxx
+Set z-axis label. Only applies to 3d plots.
+.TP
+.B \-\-title " xxx
+Set the title of the plot.
+.TP
+.B \-\-legend " curveID=legend
+Set the label for a curve plot. Use this option multiple times for multiple curves. With 
+.B --dataid
+, curveID is the ID. Otherwise, it's the index of the curve, starting at 0.
+.TP
+.B \-\-autolegend
+Use the curve IDs for the legend. Titles given with
+.B --legend
+override these.
+.TP
+.B \-\-xlen " xxx
+When using 
+.B --stream
+, sets the size of the x-window to plot. Omit this or set it to 0 to plot ALL the data. Does not make sense with 3d plots. Implies
+.B --monotonic
+.TP
+.B \-\-xmin " xxx
+Set the minimal point in range for the x-axis. These are ignored in a streaming plot.
+.TP
+.B \-\-xmax " xxx
+Set the maximal point in range for the x-axis. These are ignored in a streaming plot.
+.TP
+.B \-\-ymin " xxx
+Set the minimal point in range for the y-axis.
+.TP
+.B \-\-ymax " xxx
+Set the maximal point in range for the y-axis.
+.TP
+.B \-\-y2min " xxx
+Set the minimal point in range for the y2-axis. Does not apply to 3d plots.
+.TP
+.B \-\-y2max " xxx
+Set the maximal point in range for the y2-axis. Does not apply to 3d plots.
+.TP
+.B \-\-zmin " xxx
+Set the minimal point in range for the z-axis. Only applies to 3d plots or colormaps.
+.TP
+.B \-\-zmax " xxx
+Set the maximal point in range for the z-axis. Only applies to 3d plots or colormaps.
+.TP
+.B \-\-y2 " xxx
+Plot the data specified by this curve ID on the y2 axis. Without
+.B --dataid
+, the ID is just an ordered 0-based index. Does not apply to 3d plots.
+.TP
+.B \-\-curvestyle " curveID=style
+Additional styles per curve. With
+.B --dataid
+, curveID is the ID. Otherwise, it's the index of the curve, starting at 0. Use this option multiple times for multiple curves.
+.TP
+.B \-\-curvestyleall " xxx
+Additional styles for ALL curves.
+.TP
+.B \-\-extracmds " xxx
+Additional commands. These could contain extra global styles for instance.
+.TP
+.B \-\-size " xxx
+Gnuplot size option.
+.TP
+.B \-\-square
+Plot data with aspect ratio 1. For 3D plots, this controls the aspect ratio for all 3 axes.
+.TP
+.B \-\-square_xy
+For 3D plots, set square aspect ratio for ONLY the x,y axes.
+.TP
+.B \-\-hardcopy " xxx
+If not streaming, output to a file specified here. Format inferred from filename.
+.TP
+.B \-\-maxcurves " xxx
+The maximum allowed number of curves. This is 100 by default, but can be reset with this option. This exists purely to prevent perl from allocating all of the system's memory when reading bogus data.
+.TP
+.B \-\-monotonic
+If
+.B --domain
+is given, checks to make sure that the x-coordinate in the input data is monotonically increasing. If a given x-variable is in the past, all data currently cached for this curve is purged. Without 
+.B --monotonic
+, all data is kept. Does not make sense with 3d plots. No 
+.B --monotonic
+by default.
+.TP
+.B \-\-extraValuesPerPoint " xxx
+How many extra values are given for each data point. Normally this is 0, and does not need to be specified, but sometimes we want extra data, like for colors or point sizes or error bars, etc.
+.B feedGnuplot
+options that require this (colormap, circles) automatically set it. This option is ONLY needed if unknown styles are used, with 
+.B --curvestyleall
+for instance.
+.TP
+.B \-\-dump
+Instead of printing to gnuplot, print to STDOUT. For debugging.
+
+.SH EXAMPLE
+.IP 1. 4
+Simple real-time plotting example: plot how much data is received on the wlan0 network interface in bytes/second
+.TP
+.B while true; do sleep 1; cat /proc/net/dev; done | gawk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' | \\
+.B feedgnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
+.PP
+Reads the stats of the network interface 'wlan0' every second, reformats it with
+.B gawk
+and pipes the formatted output into
+.B feedGnuplot
+to create a line plot (
+.B --lines
+) of the streaming input (
+.B --stream
+). Always show the last 10 seconds (
+.B --xlen
+) and use the labels 'seconds' for the x-axis and 'Bytes/sec' for the y-axis.
+.IP 2. 4
+Simple real-time plotting example: plot the 'idle' CPU consumption against time
+.TP
+.B sar 1 -1 | awk '$1 ~ /..:..:../ && $8 ~/^[0-9\.]*$/ {print $1,$8; fflush()}' | \\
+.B feedgnuplot --stream --domain --lines --timefmt '%H:%M:%S' --set 'format x "%H:%M:%S"'
+.PP
+Reads the CPU IDLE consumption and sets the current time as x-axis key.
+
+.SH AUTHOR
+Written by Dima Kogan <dima at secretsauce.net>.
+.SH BUGS
+Report Bugs on <https://github.com/dkogan/feedgnuplot/issues>.
+.SH "SEE ALSO"
+gnuplot(1), awk(1), sar(1),  likwid-perfscope(1), likwid-perfctr(1)
diff --git a/doc/likwid-accessD.1 b/doc/likwid-accessD.1
new file mode 100644
index 0000000..7d444af
--- /dev/null
+++ b/doc/likwid-accessD.1
@@ -0,0 +1,22 @@
+.TH LIKWID-ACCESSD 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-accessD \- This tool forwards the access operations from LIKWID PerfMon tools
+to the MSR device files
+.SH DESCRIPTION
+.B likwid-accessD
+is a command line application that opens a UNIX file socket and waits for access
+operations from LIKWID tools that require access to the MSR and PCI device
+files. The MSR and PCI device files are only accessible for users with root
+privileges, therefore
+.B likwid-accessD
+requires the suid-bit set.
+Depending on the current system architecture,
+.B likwid-accessD
+permits only access to registers defined for the architecture.
+
+.SH AUTHOR
+Written by Thomas Roehl <thomas.roehl at gmail.com>.
+.SH BUGS
+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+.SH "SEE ALSO"
+likwid-perfctr(1), likwid-powermeter(1), likwid-features(1), likwid-pin(1), likwid-topology(1)
diff --git a/doc/likwid-bench.1 b/doc/likwid-bench.1
index b7414a0..45d0f6c 100644
--- a/doc/likwid-bench.1
+++ b/doc/likwid-bench.1
@@ -1,19 +1,20 @@
 .TH LIKWID-BENCH 1 <DATE> likwid\-<VERSION>
+.WARN
 .SH NAME
 likwid-bench \- low-level benchmark suite and microbenchmarking framework
 .SH SYNOPSIS
 .B likwid-bench
 .RB [\-hap]
 .RB [ \-l
-.IR testname ] 
+.IR <testname> ] 
 .RB [ \-i
-.IR iterations ]
+.IR <iterations> ]
 .RB [ \-g
-.IR number_of_workgroups ]
+.IR <number_of_workgroups> ]
 .RB [ \-t
-.IR testname ]
+.IR <testname> ]
 .RB [ \-w
-.IR workgroup_expression ]
+.IR <workgroup_expression> ]
 .SH DESCRIPTION
 .B likwid-bench
 is a benchmark suite for low-level (assembly) benchmarks to measure bandwidths and instruction throughput for specific instruction code on x86 systems. The currently included benchmark codes include common data access patterns like load and store but also calculations like vector triad and sum.
@@ -38,25 +39,25 @@ list available benchmark codes for the current system.
 .B \-\^p
 list available thread domains.
 .TP
-.B \-\^l " testname"
+.B \-\^l " <testname>"
 list properties of a benchmark code.
 .TP
-.B \-\^i " iterations"
+.B \-\^i " <iterations>"
 number of iterations to perform inside the benchmark code.
 .TP
-.B \-\^t " testname"
+.B \-\^t " <testname>"
 Name of the benchmark code to run (mandatory).
 .TP
-.B \-\^g " number_of_workgroups"
+.B \-\^g " <number_of_workgroups>"
 specify the number of workgroups to perform the benchmark code on (mandatory).
 .TP
-.B \-\^w " workgroup_expression"
+.B \-\^w " <workgroup_expression>"
 Specify the affinity domain, thread count and data set size for the current benchmarking run (mandatory).
 
 .SH WORKGROUP SYNTAX
-.B <thread_domain>:<size>[:<num_threads>[:<chunk_size>:<stride>]][-<streamId>:<domain_id>]
-with size in kB, MB or GB.
-Where thread domain is where threads are placed. Size is the total data set size for the benchmark. num_threads specifies how many threads are used. Threads are always placed using a compact policy in
+
+.B <thread_domain>:<size> [:<num_threads>[:<chunk_size>:<stride>]] [-<streamId>:<domain_id>]
+with size in kB, MB or GB. Where thread domain is where threads are placed. Size is the total data set size for the benchmark. num_threads specifies how many threads are used. Threads are always placed using a compact policy in
 .B likwid-bench.
 This means that per default all SMT threads are used. Optionally similar a the expression based syntax in
 .B likwid-pin
@@ -95,8 +96,8 @@ The option INSTRUMENT_BENCH in config.mk needs to be true at compile time to use
 .PP
 .B likwid-perfctr 
 will configure and start the performance counters on socket 0 with 4 threads prior to the execution of
-.B likwid-bench
-. The performance counters are read right before and after running the benchmarking code to 
+.B likwid-bench.
+The performance counters are read right before and after running the benchmarking code to 
 minimize the interferences of the measurement.
 .IP 5. 4
 Run the copy benchmark and place the data on other socket
diff --git a/doc/likwid-features.1 b/doc/likwid-features.1
index 589b2eb..e67cf44 100644
--- a/doc/likwid-features.1
+++ b/doc/likwid-features.1
@@ -5,9 +5,11 @@ likwid-features \- print and toggle the flags of the MSR_IA32_MISC_ENABLE model
 .B likwid-features 
 .RB [ \-vh ]
 .RB [ \-c
-.IR coreId ]
-.RB [ \-su
-.IR prefetcher_tag ]
+.IR <coreId> ]
+.RB [ \-s
+.IR <prefetcher_tag> ]
+.RB [ \-u
+.IR <prefetcher_tag> ]
 .SH DESCRIPTION
 .B likwid-features
 is a command line application to print the flags in the model
@@ -41,13 +43,13 @@ prints version information to standard output, then exits.
 .B \-\^h
 prints a help message to standard output, then exits.
 .TP
-.B \-\^c " coreId"
+.B \-\^c " <coreId>"
 set on which processor core the MSR should be read
 .TP
-.B \-\^u " HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER"
+.B \-\^u " <HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER>"
 specify which prefetcher to unset
 .TP
-.B \-\^s " HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER"
+.B \-\^s " <HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER>"
 specify which prefetcher to set
 
 .SH AUTHOR
diff --git a/doc/likwid-genCfg.1 b/doc/likwid-genCfg.1
new file mode 100644
index 0000000..8b7632f
--- /dev/null
+++ b/doc/likwid-genCfg.1
@@ -0,0 +1,30 @@
+.TH LIKWID-GENCFG 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-genCfg \- Get the system topology and write it to a file for faster LIKWID startup
+.SH SYNOPSIS
+.B likwid-genCfg
+.RB [\-hv]
+.RB [ \-o
+.IR <filename>]
+.SH DESCRIPTION
+.B likwid-genCfg
+is a command line application that stores the system's CPU and NUMA topology to
+file. LIKWID applications use this file to read in the topology fast instead of
+re-gathering all values. The default output path is /etc/likwid.cfg.
+.SH OPTIONS
+.TP
+.B \-h
+prints a help message to standard output, then exits.
+.TP
+.B \-v
+prints a version message to standard output, then exits.
+.TP
+.B \-\^o " <filename>
+sets output file path (optional)
+
+.SH AUTHOR
+Written by Thomas Roehl <thomas.roehl at gmail.com>.
+.SH BUGS
+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+.SH "SEE ALSO"
+likwid-topology(1), likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1)
diff --git a/doc/likwid-memsweeper.1 b/doc/likwid-memsweeper.1
new file mode 100644
index 0000000..f474360
--- /dev/null
+++ b/doc/likwid-memsweeper.1
@@ -0,0 +1,28 @@
+.TH LIKWID-MEMSWEEPER 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-memsweeper \- A tool to clean up NUMA memory domains and last level caches.
+.SH SYNOPSIS
+.B likwid-memsweeper
+.RB [\-hv]
+.RB [ \-c
+.IR <NUMA_ID> ]
+.SH DESCRIPTION
+.B likwid-memsweeper
+is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover the tool invalidates all cachelines in the LLC for 64 bit x86 systems. If no NUMA domain is specified, all are swept.
+.SH OPTIONS
+.TP
+.B \-h
+prints a help message to standard output, then exits.
+.TP
+.B \-v
+prints a version message to standard output, then exits.
+.TP
+.B \-\^c " <NUMA_ID>
+set the NUMA domain for sweeping.
+
+.SH AUTHOR
+Written by Thomas Roehl <thomas.roehl at gmail.com>.
+.SH BUGS
+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+.SH "SEE ALSO"
+likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1)
diff --git a/doc/likwid-mpirun.1 b/doc/likwid-mpirun.1
new file mode 100644
index 0000000..765b0c8
--- /dev/null
+++ b/doc/likwid-mpirun.1
@@ -0,0 +1,81 @@
+.TH LIKWID-MPIRUN 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-mpirun \- A tool to start and monitor MPI applications with LIKWID
+.SH SYNOPSIS
+.B likwid-mpirun
+.RB [\-hd]
+.RB [ \-hostfile
+.IR filename ]
+.RB [ \-nperdomain
+.IR number_of_processes_in_domain ]
+.RB [ \-pin
+.IR expression ]
+.RB [ \-omp
+.IR expression ]
+.RB [ \-mpi
+.IR expression ]
+.RB [\-\-]
+.SH DESCRIPTION
+.B likwid-mpirun
+is a command line application that wraps the vendor-specific mpirun tool and adds calls to
+.B likwid-perfctr(1)
+to the execution string. The user-given application is run and measured, and the results are returned to the starting node.
+.SH OPTIONS
+.TP
+.B \-h
+prints a help message to standard output, then exits.
+.TP
+.B \-d
+prints debug messages to standard output.
+.TP
+.B \-\^hostfile " filename
+specifies the nodes to schedule the MPI processes on
+.TP
+.B \-\^nperdomain " number_of_processes_in_domain
+specifies the processes per affinity domain (see
+.B likwid-pin
+for info about affinity domains)
+.TP
+.B \-\^pin " expression
+specifies the pinning for hybrid execution (see
+.B likwid-pin
+for info about affinity domains)
+.TP
+.B \-\^omp " expression
+enables hybrid setup. Can only be used in combination with
+.B -pin.
+The only possible value is: intel
+.TP
+.B \-\^mpi " expression
+specifies the MPI implementation that should be used by the wrapper. Possible values are intelmpi, openmpi and mvapich2
+.TP
+.B \-\-
+stops parsing arguments for likwid-mpirun, in order to set options for underlying MPI implementation after \-\-.
+
+.SH EXAMPLE
+.IP 1. 4
+For standard application:
+.TP
+.B likwid-mpirun -np 32  ./myApp
+.PP
+Will run 32 MPI processes, each host is filled with as many processes as written in ppn
+.IP 2. 4
+With pinning:
+.TP
+.B likwid-mpirun -np 32 -nperdomain S:2  ./myApp
+.PP
+Will start 32 MPI processes with 2 processes per socket.
+.IP 3. 4
+For hybrid runs:
+.TP
+.B likwid-mpirun -np 32 -pin M0:0-3_M1:0-3  ./myApp
+.PP
+Will start 32 MPI processes with 2 processes per node. Threads of the first process are pinned to the cores 0-3 in NUMA domain 0 (M0). The OpenMP threads of the second process are pinned to the first four cores in NUMA domain 1 (M1)
+
+
+.SH AUTHOR
+Written by Thomas Roehl <thomas.roehl at gmail.com>.
+.SH BUGS
+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+.SH "SEE ALSO"
+likwid-pin(1), likwid-perfctr(1), likwid-features(1), likwid-powermeter(1), likwid-topology(1)
diff --git a/doc/likwid-perfctr.1 b/doc/likwid-perfctr.1
index 0156136..ea3e4f3 100644
--- a/doc/likwid-perfctr.1
+++ b/doc/likwid-perfctr.1
@@ -5,19 +5,19 @@ likwid-perfctr \- configure and read out hardware performance counters on x86 cp
 .B likwid-perfctr 
 .RB [\-vhHVmaeiMoO]
 .RB [ \-c/\-C
-.IR core_list ]
+.IR <core_list> ]
 .RB [ \-g
-.IR performance_group
+.IR <performance_group>
 or
-.IR performance_event_string ]
+.IR <performance_event_string> ]
 .RB [ \-t
-.IR frequency ]
+.IR <frequency> ]
 .RB [ \-S
-.IR time ]
+.IR <time> ]
 .RB [ \-s
-.IR skip_mask ]
+.IR <skip_mask> ]
 .RB [ \-o
-.IR output_file ]
+.IR <output_file> ]
 .SH DESCRIPTION
 .B likwid-perfctr
 is a lightweight command line application to configure and read out hardware performance monitoring data
@@ -29,46 +29,66 @@ The following x86 processors are supported:
 .IP \[bu] 
 .B Intel Core 2:
 all variants. Counters:
-.I PMC0, PMC1, FIXC0, FIXC1, FIXC2
+.I PMC[0-1], FIXC[0-2]
 .IP \[bu] 
 .B Intel Nehalem:
-all variants. Counters:
-.I PMC0, PMC1, PMC2, PMC3, UPMC0 - UPMC7, FIXC0, FIXC1, FIXC2
+Counters:
+.I PMC[0-3], FIXC[0-2], UPMC[0-7]
 .IP \[bu] 
 .B Intel Nehalem EX:
-all variants, no uncore for the moment. Counters:
-.I PMC0, PMC1, PMC2, PMC3, FIXC0, FIXC1, FIXC2
+Counters:
+.I PMC[0-3], FIXC[0-2], MBOX[0-1]C[0-5], BBOX[0-1]C[0-3], RBOX[0-1]C[0-7], WBOX[0-5], UBOX0, SBOX[0-1]C[0-3], CBOX[0-9]C[0-4]
 .IP \[bu] 
 .B Intel Westmere:
-all variants. Counters:
-.I PMC0, PMC1, PMC2, PMC3, UPMC0 - UPMC7, FIXC0, FIXC1, FIXC2
+Counters:
+.I PMC[0-3], FIXC[0-2], UPMC[0-7]
+.IP \[bu] 
+.B Intel Westmere EX:
+Counters:
+.I PMC[0-3], FIXC[0-2], MBOX[0-1]C[0-5], BBOX[0-1]C[0-3], RBOX[0-1]C[0-7], WBOX[0-5], UBOX0, SBOX[0-1]C[0-3], CBOX[0-9]C[0-4]
 .IP \[bu] 
 .B Intel Sandy Bridge:
-all variants, partial support for uncore, full RAPL support. Counters:
-.I PMC0, PMC1, PMC2, PMC3, FIXC0, FIXC1, FIXC2
+full RAPL support. Counters:
+.I PMC[0-3], FIXC[0-2], PWR[0-3]
+.IP \[bu] 
+.B Intel Sandy Bridge EP:
+partial support for uncore, full RAPL support. Counters:
+.I PMC[0-3], FIXC[0-2], PWR[0-3], MBOX[0-3]C[0-3]
 .IP \[bu] 
 .B Intel Ivy Bridge:
-all variants, partial support for uncore, full RAPL support. Counters:
-.I PMC0, PMC1, PMC2, PMC3, FIXC0, FIXC1, FIXC2
+full RAPL support. Counters:
+.I PMC[0-3], FIXC[0-2], PWR[0-3]
+.IP \[bu] 
+.B Intel Ivy Bridge EP:
+partial support for uncore, full RAPL support. Counters:
+.I PMC[0-3], FIXC[0-2], PWR[0-3], CBOX[0-9]C[0-3], MBOX[0-3]C[0-3], MBOX[0-3]FIX
 .IP \[bu] 
 .B Intel Haswell:
-only desktop variants, full RAPL support. Counters:
-.I PMC0, PMC1, PMC2, PMC3, FIXC0, FIXC1, FIXC2
+full RAPL support. Counters:
+.I PMC[0-3], FIXC[0-2], PWR[0-3]
+.IP \[bu] 
+.B Intel Haswell EP:
+no uncore support, full RAPL support. Counters:
+.I PMC[0-3], FIXC[0-2], PWR[0-3]
+.IP \[bu] 
+.B Intel Atom Silvermont:
+full RAPL support. Counters:
+.I PMC[0-1], FIXC[0-2], PWR[0-1]
 .IP \[bu] 
 .B Intel Pentium M:
 Banias and Dothan variants. Counters:
-.I PMC0, PMC1
+.I PMC[0-1]
 .IP \[bu] 
 .B Intel P6:
 Tested on P3.
 .IP \[bu] 
 .B AMD K8:
 all variants. Counters:
-.I PMC0, PMC1, PMC2, PMC3
+.I PMC[0-3]
 .IP \[bu] 
 .B AMD K10:
 Barcelona, Shanghai, Istanbul, MagnyCours based processors. Counters:
-.I PMC0, PMC1, PMC2, PMC3
+.I PMC[0-3]
 
 .SH OPTIONS
 .TP
@@ -93,7 +113,7 @@ print available performance groups for current processor.
 .B \-\^e
 print available counters and performance events of current processor.
 .TP
-.B \-\^o
+.B \-\^o " <filename>
 store all ouput to a file instead of stdout. For the filename the following placeholders are supported: 
 %j for PBS_JOBID, %r for MPI RANK (only Intel MPI at the moment), %h hostname and %p for process pid.
 The placeholders must be separated by underscore as, e.g., -o test_%h_%p. You must specify a suffix to
@@ -106,26 +126,26 @@ Do not print tables for results, use easily parseable CSV instead.
 .B \-\^i
 print cpuid information about processor and on Intel Performance Monitoring features, then exit.
 .TP
-.B \-\^c " processor_list"
+.B \-\^c " <processor_list>"
 specify a numerical list of processors. The list may contain multiple 
 items, separated by comma, and ranges. For example 0,3,9-11.
 .TP
-.B \-\^C " processor_list"
+.B \-\^C " <processor_list>"
 specify a numerical list of processors. The list may contain multiple 
 items, separated by comma, and ranges. For example 0,3,9-11. This variant will
 also pin the threads to the cores. Also logical numberings can be used.
 .TP
-.B \-\^g " performance group or performance event set string"
+.B \-\^g " <performance group> or <performance event set string>"
 specify which performance group to measure. This can be one of the tags output with the -a flag.
 Also a custom event set can be specified by a comma separated list of events. Each event has the format
 eventId:register with the the register being one of a architecture supported performance counter registers.
 .TP
-.B \-\^t " frequency of measurements in seconds"
-timeline mode for time resolved measurements. The output has the format:
+.B \-\^t " <frequency of measurements>"
+timeline mode for time resolved measurements, possible suffixes 's' and 'ms' like 100ms. The output has the format:
 .TP
 .B <Event> <Timestamp> <Result thread0> <Result thread1> ...
 .TP
-.B \-\^S " time_in_seconds"
+.B \-\^S " <time_in_seconds>"
 stethoscope mode with duration in senconds. Can be used to measure an application from the outside.
 
 .SH EXAMPLE
@@ -137,13 +157,13 @@ or use the builtin pin functionality.
 .IP 1. 4
 As wrapper with performance group:
 .TP
-.B likwid-perfctr  -C 0-2  -g TLB   ./cacheBench -n 2 -l 1048576 -i 100 -t Stream
+.B likwid-perfctr -C 0-2 -g TLB ./cacheBench -n 2 -l 1048576 -i 100 -t Stream
 .PP
 The parent process is pinned to processor 0, Thread 0 to processor 1 and Thread 1 to processor 2.
 .IP 2. 4
 As wrapper with custom event set on AMD:
 .TP
-.B likwid-perfctr  -C 0-4  -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3   ./cacheBench
+.B likwid-perfctr -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./myApp
 .PP
 It is specified that the event
 .B INSTRUCTIONS_RETIRED_SSE
@@ -160,7 +180,7 @@ event. If you want this you have to include this event in your custom event stri
 .IP 3. 4
 As wrapper with custom event set on Intel:
 .TP
-.B likwid-perfctr  -C 0  -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,UNC_L3_LINES_IN_ANY:UPMC0  ./stream-icc
+.B likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1 ./myApp
 .PP
 On Intel processors fixed events are measured on dedicated counters. These are
 .B INSTR_RETIRED_ANY
@@ -175,7 +195,7 @@ will calculate the runtime and CPI metrics for your run.
 .IP 4. 4
 Using the marker API to measure only parts of your code (this can be used both with groups or custom event sets):
 .TP
-.B likwid-perfctr -m -C 0-4  -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3   ./cacheBench
+.B likwid-perfctr -m -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./cacheBench
 .PP
 You have to link you code against liblikwid.a/.so and use the marker API calls.
 The following code snippet shows the necessary calls:
@@ -211,18 +231,22 @@ if (threadId == 0)
 .IP 5. 4
 Using likwid in timeline mode:
 .TP
-.B likwid-perfctr -c 0-3  -g FLOPS_DP -t 300ms  ./cacheBench > out.txt
+.B likwid-perfctr -c 0-3 -g FLOPS_DP -t 300ms  ./myApp > out.txt
 .PP
 This will read out the counters every 300ms on physical cores 0-3 and write the results to out.txt.
 For timeline mode there is a frontend application likwid-scope, which enables live plotting of selected events.
-For more code examples have a look at the likwid WIKI pages.
+For more code examples have a look at the likwid WIKI pages. The processes are
+.B not
+pinned to the CPUs 0-3.
 
 .IP 6. 4
 Using likwid in stethoscope mode:
 .TP
-.B likwid-perfctr -c 0-3  -g FLOPS_DP -S 2s
+.B likwid-perfctr -c 0-3 -g FLOPS_DP -S 2s
 .PP
-This will start the counters and read them out after 2s on physical cores 0-3 and write the results to stdout.
+This will start the counters and read them out after 2s on physical cores 0-3 and write the results to stdout. The processes are
+.B not
+pinned to the CPUs 0-3.
 
 .SH AUTHOR
 Written by Jan Treibig <jan.treibig at gmail.com>.
diff --git a/doc/likwid-perfscope.1 b/doc/likwid-perfscope.1
new file mode 100644
index 0000000..2d48e21
--- /dev/null
+++ b/doc/likwid-perfscope.1
@@ -0,0 +1,55 @@
+.TH LIKWID-PERFSCOPE 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-perfscope \- Frontend for the timeline mode of
+.B likwid-perfctr(1)
+that on-the-fly generates pictures from the measurements
+.SH SYNOPSIS
+.B likwid-perfscope 
+.RB [\-h]
+.RB [ \-cores
+.IR <cpu_list> ]
+.RB [ \-freq
+.IR <frequency> ]
+.RB [ \-group
+.IR <eventset> ]
+.SH DESCRIPTION
+.B likwid-perfscope
+is a command line application written in Perl that uses the timeline daemon mode of
+.B likwid-perfctr(1)
+to create on-the-fly pictures with the current measurements. It uses the
+.B feedGnuplot(1)
+script to send the current data to gnuplot.
+.SH OPTIONS
+.TP
+.B \-h
+prints a help message to standard output, then exits.
+.TP
+.B \-\^cores " <cpu_list>
+measures the given group on given CPUs in <cpu_list>
+.TP
+.B \-\^freq " <frequency>
+reads the current performance values every <frequency>. Available suffixes are 's' and 'ms', e.g. 500ms. Default value is 1s
+.TP
+.B \-\^group " <eventset>
+defines the events and counters that should be read. Possible values can be gathered from
+.B likwid-perfctr(1).
+Default is group 'FLOPS_DP'
+
+.SH EXAMPLE
+.IP 1. 4
+Monitor double precision floating-point operations:
+.TP
+.B likwid-perfscope -group FLOPS_DP -cores 0-3 -freq 500ms
+.PP
+Executes
+.B likwid-perfctr
+on the first four cores. The values are read every 500ms and forwarded to gnuplot using the
+.B feedGnuplot
+script.
+
+.SH AUTHOR
+Written by Jan Treibig <jan.treibig at gmail.com>.
+.SH BUGS
+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+.SH "SEE ALSO"
+likwid-perfctr(1), feedGnuplot(1), likwid-pin(1), likwid-powermeter(1), likwid-setFrequencies(1)
diff --git a/doc/likwid-pin.1 b/doc/likwid-pin.1
index 559f47f..efea873 100644
--- a/doc/likwid-pin.1
+++ b/doc/likwid-pin.1
@@ -3,20 +3,18 @@
 likwid-pin \- pin a sequential or threaded application to dedicated processors
 .SH SYNOPSIS
 .B likwid-pin 
-.RB [\-vhqip]
+.RB [\-vhqipS]
 .RB [ \-c
-.IR core_list ]
+.IR <core_list> ]
 .RB [ \-s
-.IR skip_mask ]
-.RB [ \-S
-.IR Sweep_memory_before_run]
+.IR <skip_mask> ]
 .RB [ \-d
-.IR delimiter ]
+.IR <delimiter> ]
 .SH DESCRIPTION
 .B likwid-pin
 is a command line application to pin a sequential or multithreaded 
 applications to dedicated processors. It can be used as replacement for 
-.B taskset(1). 
+.B taskset(1).
 Opposite to taskset no affinity mask but single processors are specified.
 For multithreaded applications based on the pthread library the 
 .I pthread_create
@@ -27,8 +25,8 @@ to a dedicated processor as specified in
 .PP
 Per default every generated thread is pinned to the core in the order of calls 
 to 
-.I pthread_create
-. It is possible to skip single threads using -s commandline option.
+.I pthread_create.
+It is possible to skip single threads using -s commandline option.
 .PP
 For OpenMP implementations gcc and icc compilers are explicitly supported. Others may also work.
 .B likwid-pin
@@ -67,7 +65,7 @@ prints version information to standard output, then exits.
 .B \-\^h
 prints a help message to standard output, then exits.
 .TP
-.B \-\^c " processor_list OR thread expression OR scatter policy "
+.B \-\^c " <processor_list> OR <thread_expression> OR <scatter policy> "
 specify a numerical list of processors. The list may contain multiple 
 items, separated by comma, and ranges. For example 0,3,9-11. You can also use
 logical numberings, either within a node (N), a socket (S<id>) or a numa domain (M<id>).
@@ -75,10 +73,10 @@ likwid-pin also supports logical pinning within a cpuset with a L prefix. If you
 likwid-pin will pin the threads to the processors on the node with physical cores first.
 See below for details on using a thread expression or scatter policy
 .TP
-.B \-\^s " skip_mask
+.B \-\^s " <skip_mask>
 Specify skip mask as HEX number. For each set bit the corresponding thread is skipped.
 .TP
-.B \-\^S " enable memory sweeper
+.B \-\^S
 All ccNUMA memory domains belonging to the specified threadlist will be cleaned before the run. Can solve file buffer cache problems on Linux.
 .TP
 .B \-\^p
@@ -90,7 +88,7 @@ set numa memory policy to interleave spanning all numa nodes involved in pinning
 .B \-\^q
 silent execution without output
 .TP
-.B \-\^d
+.B \-\^d " <delimiter>
 set delimiter used to output the physical processor list (-p & -c)
 
 
diff --git a/doc/likwid-powermeter.1 b/doc/likwid-powermeter.1
index 14dce68..f4a3ba2 100644
--- a/doc/likwid-powermeter.1
+++ b/doc/likwid-powermeter.1
@@ -1,15 +1,15 @@
 .TH LIKWID-POWERMETER 1 <DATE> likwid\-<VERSION>
 .SH NAME
-likwid-powermeter \- A tool to print Power and Clocking information on Intel CPUS
+likwid-powermeter \- A tool to print power and clocking information on Intel CPUs
 .SH SYNOPSIS
 .B likwid-powermeter 
 .RB [ \-vhip ]
 .RB [ \-c
-.IR socket_list ]
+.IR <socket_list> ]
 .RB [ \-s
-.IR duration_in_seconds ]
+.IR <duration_in_seconds> ]
 .RB [ \-M
-.IR access mode (0=direct, 1=accessDaemon) ]
+.IR <access_mode> ]
 .SH DESCRIPTION
 .B likwid-powermeter
 is a command line application to get the energy comsumption of Intel RAPL capable processors. 
@@ -17,7 +17,8 @@ It also prints information about TDP and Turbo Mode steps supported.
 The Turbo Mode information works on all Turbo mode enabled Intel processors. The tool can be either used
 in stethoscope mode for a specified duration or as a wrapper to your application measuring your complete 
 run. RAPL works on a per package (socket) base.
-Please note that the RAPL counters are also accessible as normal events within likwid-perfctr.
+Please note that the RAPL counters are also accessible as normal events within
+.B likwid-perfctr.
 .SH OPTIONS
 .TP
 .B \-\^v
@@ -26,7 +27,7 @@ prints version information to standard output, then exits.
 .B \-\^h
 prints a help message to standard output, then exits.
 .TP
-.B \-\^c " socket_list"
+.B \-\^c " <socket_list>"
 set on which sockets the RAPL interface is accessed. comma-separated list of socket IDs
 .TP
 .B \-\^p
@@ -35,10 +36,10 @@ prints out information about dynamic clocks and CPI information on the socket me
 .B \-\^i
 prints out information TDP and Turbo mode steps
 .TP
-.B \-\^M
+.B \-\^M " <access_mode>"
 set the access method. 0 for direct access to MSR/RAPL registers, 1 for using the accessDaemon.
 .TP
-.B \-\^s
+.B \-\^s " <duration_in_seconds>
 measure the power for a specific time (default 2s)
 
 
diff --git a/doc/likwid-setFreq.1 b/doc/likwid-setFreq.1
new file mode 100644
index 0000000..87054c7
--- /dev/null
+++ b/doc/likwid-setFreq.1
@@ -0,0 +1,24 @@
+.TH LIKWID-SETFREQ 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-setFreq \- Mediator for
+.B likwid-setFrequencies(1)
+that performs the actual setting of CPU cores' frequency and governor.
+.SH SYNOPSIS
+.B likwid-setFreq 
+.IR <coreId>
+.IR <frequency>
+.IR [<governor>]
+
+.SH DESCRIPTION
+.B likwid-setFreq
+is a command line application that mediates the request from
+.B likwid-setFrequencies(1)
+because setting a CPU core's frequency and/or governor requires root privileges. This executable must be suid-root.
+
+
+.SH AUTHOR
+Written by Jan Treibig <jan.treibig at gmail.com>.
+.SH BUGS
+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+.SH "SEE ALSO"
+likwid-setFrequencies(1), likwid-perfctr(1), feedGnuplot(1), likwid-pin(1), likwid-powermeter(1)
diff --git a/doc/likwid-setFrequencies.1 b/doc/likwid-setFrequencies.1
index 50d70a9..b268280 100644
--- a/doc/likwid-setFrequencies.1
+++ b/doc/likwid-setFrequencies.1
@@ -5,16 +5,16 @@ likwid-setFrequencies \- print and manage the clock frequency of CPU cores
 .B likwid-setFrequencies 
 .RB [\-hpl]
 .RB [ \-c
-.IR cpu_list,_socket_list_or_expression ]
+.IR <cpu_list,_socket_list_or_expression> ]
 .RB [ \-g
-.IR governor ]
+.IR <governor> ]
 .RB [ \-f
-.IR frequency ]
+.IR <frequency> ]
 .SH DESCRIPTION
 .B likwid-setFrequencies
 is a command line application to set the clock frequency of CPU cores. Since only priviledged users are allowed to change the frequency of CPU cores, the application works in combination with a daemon 
-.B likwid-setFreq
-. The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With 
+.B likwid-setFreq.
+The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With 
 .B likwid-setFrequencies
 the clock of all cores inside a the cpu_list or affinity domain can be set to a specific frequency or governor at once.
 .SH OPTIONS
@@ -28,14 +28,14 @@ prints the current frequencies for all CPU cores
 .B \-l
 prints all configurable frequencies
 .TP
-.B \-c
+.B \-\^c " <cpu_list,_socket_list_or_expression>
 set the affinity domain where to set the frequencies. Common are N (Node), SX (Socket X), CX (Cache Group X) and MX (Memory Group X). For detailed information about affinity domains see
 .B likwid-pin(1)
 .TP
-.B \-g
+.B \-\^g " <governor>
 set the governor of all CPU cores inside the affinity domain. Current governors are ondemand, performance, turbo. Default is ondemand
 .TP
-.B \-f
+.B \-\^f " <frequency>
 set a fixed frequency at all CPU cores inside the affinity domain. Implicitly sets userspace governor for the cores.
 
 .SH AUTHOR
diff --git a/doc/likwid-topology.1 b/doc/likwid-topology.1
index c3a0316..64bc8b4 100644
--- a/doc/likwid-topology.1
+++ b/doc/likwid-topology.1
@@ -5,14 +5,13 @@ likwid-topology \- print thread and cache topology
 .B likwid-topology 
 .RB [\-hvgcC]
 .RB [ \-o
-.IR output_file ]
+.IR <filename> ]
 .SH DESCRIPTION
 .B likwid-topology
-is a command line application to print the thread and cache
-topology on multicore x86 processors. Used with mono spaced fonts it can
+is a command line application to print the thread and cache topology on multicore x86 processors. Used with mono spaced fonts it can
 draw the processor topology of a machine in ASCII art. Beyond topology
-likwid-topology determines the clock of a processor and prints detailed
-informations about the caches hierarchy and NUMA structure.
+.B likwid-topology
+determines the clock of a processor and prints detailed informations about the caches hierarchy and NUMA structure.
 .SH OPTIONS
 .TP
 .B \-v
@@ -28,9 +27,10 @@ prints topology information in ASCII art. Best viewed with monospaced font.
 prints detailed informations about cache hierarchy
 .TP
 .B \-C
-measures and output the processor clock. This involves a longer runtime of likwid-topology.
+measures and output the processor clock. This involves a longer runtime of
+.B likwid-topology.
 .TP
-.B \-o
+.B \-\^o " <filename>
 Specify output file for topology information. According to the file suffix, the information
 is converted using converter scripts installed at <PREFIX>/share/likwid
 
diff --git a/filters/csv b/filters/csv
index 626916b..654f204 100755
--- a/filters/csv
+++ b/filters/csv
@@ -67,8 +67,9 @@ if ($fileType eq 'topology') {
         } elsif ($region eq 'numa') {
             if (/Domain ([0-9]*)/) {
                 print OUTFILE 'Domain ID'.$SEP.$1.$NL;
-            } elsif (/Memory:.*total ([0-9.]+) MB/) {
-                print OUTFILE 'Memory [MB]'.$SEP.$1.$NL;
+            } elsif (/Memory: ([0-9.]+) MB free of total ([0-9.]+) MB/) {
+                print OUTFILE 'Free Memory [MB]'.$SEP.$1.$NL;
+                print OUTFILE 'Total Memory [MB]'.$SEP.$2.$NL;
             } elsif (/(.*):\t*[ ]*(.*)/) {
                 print OUTFILE $1.$SEP.$2.$NL;
             }
diff --git a/filters/xml b/filters/xml
index 23eaf8e..b72c430 100755
--- a/filters/xml
+++ b/filters/xml
@@ -22,6 +22,7 @@ open OUTFILE,"> $filename";
 
 if ($fileType eq 'topology') {
     my $region = 'topo';
+    my $indomain = 0;
     print OUTFILE '<node>'.$NL;
 
     while (<INFILE>) {
@@ -36,13 +37,15 @@ if ($fileType eq 'topology') {
         }
 
         if ($region eq 'topo') {
-            if (/(CPU type):\t(.*)/) {
+            if (/(CPU type):\t([\w ]*)/) {
                 print OUTFILE '<cpu>'.$2.'</cpu>'.$NL;
-            } elsif (/(Sockets):\t(.*)/) {
+            } elsif (/CPU clock:\t([\d.]) GHz/) {
+                print OUTFILE '<clock>'.$1.'</clock>'.$NL;
+            } elsif (/(Sockets):\t(\d*)/) {
                 print OUTFILE '<socketsPerNode>'.$2.'</socketsPerNode>'.$NL;
-            } elsif (/(Cores per socket):\t(.*)/) {
+            } elsif (/(Cores per socket):\t(\d*)/) {
                 print OUTFILE '<coresPerSocket>'.$2.'</coresPerSocket>'.$NL;
-            } elsif (/(Threads per core):\t(.*)/) {
+            } elsif (/(Threads per core):\t(\d*)/) {
                 print OUTFILE '<threadsPerCore>'.$2.'</threadsPerCore>'.$NL;
             } elsif (/([0-9]*)\t\t([0-9]*)\t\t([0-9]*)\t\t([0-9]*)/) {
                 #TODO Build tree for XML output from table!
@@ -68,15 +71,25 @@ if ($fileType eq 'topology') {
             }
         } elsif ($region eq 'numa') {
             if (/Domain ([0-9]*)/) {
+                if ($indomain )
+                {
+                    print OUTFILE '</domain>'.$NL;
+                }
                 print OUTFILE '<domain>'.$NL;
                 print OUTFILE '<id>'.$1.'</id>'.$NL;
-            } elsif (/Memory:.*total ([0-9.]+) MB/) {
-                print OUTFILE '<memory>'.$1.'</memory>'.$NL;
+                $indomain = 1
+            } elsif (/Memory: ([0-9.]+) MB free of total ([0-9.]+) MB/) {
+                print OUTFILE '<freememory>'.$1.'</freememory>'.$NL;
+                print OUTFILE '<totalmemory>'.$2.'</totalmemory>'.$NL;
             } elsif (/Processors:[ ]+([0-9. ]+)/) {
                 print OUTFILE '<processors>'.$1.'</processors>'.$NL;
             }
         }
     }
+    if ($indomain)
+    {
+        print OUTFILE '</domain>'.$NL;
+    }
 
     print OUTFILE '</numa>'.$NL;
     print OUTFILE '</node>'.$NL;
@@ -117,7 +130,7 @@ if ($fileType eq 'topology') {
     die "Filter failed! Unknown application type $fileType!\n";
 }
 
-unlink($infile);
+#unlink($infile);
 close INFILE;
 close OUTFILE;
 
diff --git a/groups/core2/BRANCH.txt b/groups/core2/BRANCH.txt
index 15a9ae0..2515d6c 100644
--- a/groups/core2/BRANCH.txt
+++ b/groups/core2/BRANCH.txt
@@ -3,12 +3,14 @@ SHORT Branch prediction miss rate/ratio
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  BR_INST_RETIRED_ANY
 PMC1  BR_INST_RETIRED_MISPRED
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Branch rate   PMC0/FIXC0
 Branch misprediction rate  PMC1/FIXC0
diff --git a/groups/core2/CACHE.txt b/groups/core2/CACHE.txt
index 26e310c..fd2af0c 100644
--- a/groups/core2/CACHE.txt
+++ b/groups/core2/CACHE.txt
@@ -3,12 +3,14 @@ SHORT Data cache miss rate/ratio
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPL
 PMC1  L1D_ALL_CACHE_REF
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Data cache misses PMC0
 Data cache request rate PMC1/FIXC0
diff --git a/groups/core2/DATA.txt b/groups/core2/DATA.txt
index af77c1e..c48ad99 100644
--- a/groups/core2/DATA.txt
+++ b/groups/core2/DATA.txt
@@ -3,12 +3,14 @@ SHORT Load to store ratio
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  INST_RETIRED_LOADS
 PMC1  INST_RETIRED_STORES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Load to Store ratio PMC0/PMC1
 
diff --git a/groups/core2/FLOPS_DP.txt b/groups/core2/FLOPS_DP.txt
index 81e30b3..8e72f07 100644
--- a/groups/core2/FLOPS_DP.txt
+++ b/groups/core2/FLOPS_DP.txt
@@ -3,12 +3,14 @@ SHORT Double Precision MFlops/s
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  SIMD_COMP_INST_RETIRED_PACKED_DOUBLE
 PMC1  SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 DP MFlops/s    1.0E-06*(PMC0*2.0+PMC1)/time
 
diff --git a/groups/core2/FLOPS_SP.txt b/groups/core2/FLOPS_SP.txt
index 92c95bb..acd2df7 100644
--- a/groups/core2/FLOPS_SP.txt
+++ b/groups/core2/FLOPS_SP.txt
@@ -3,12 +3,14 @@ SHORT Single Precision MFlops/s
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  SIMD_COMP_INST_RETIRED_PACKED_SINGLE
 PMC1  SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 SP MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
 
diff --git a/groups/core2/FLOPS_X87.txt b/groups/core2/FLOPS_X87.txt
index 1bcd4d6..052356e 100644
--- a/groups/core2/FLOPS_X87.txt
+++ b/groups/core2/FLOPS_X87.txt
@@ -3,11 +3,13 @@ SHORT X87 MFlops/s
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  X87_OPS_RETIRED_ANY
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 X87 MFlops/s  1.0E-06*PMC0/time
 
diff --git a/groups/core2/L2.txt b/groups/core2/L2.txt
index 8436400..88c75c5 100644
--- a/groups/core2/L2.txt
+++ b/groups/core2/L2.txt
@@ -3,12 +3,14 @@ SHORT L2 cache bandwidth in MBytes/s
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPL
 PMC1  L1D_M_EVICT
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L2 load [MBytes/s] 1.0E-06*PMC0*64.0/time
 L2 evict [MBytes/s] 1.0E-06*PMC1*64.0/time
diff --git a/groups/core2/L2CACHE.txt b/groups/core2/L2CACHE.txt
index dbbed5d..34c607a 100644
--- a/groups/core2/L2CACHE.txt
+++ b/groups/core2/L2CACHE.txt
@@ -3,12 +3,14 @@ SHORT L2 cache miss rate/ratio
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L2_RQSTS_THIS_CORE_ALL_MESI
 PMC1  L2_RQSTS_SELF_I_STATE
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L2 request rate PMC0/FIXC0
 L2 miss rate PMC1/FIXC0
diff --git a/groups/core2/MEM.txt b/groups/core2/MEM.txt
index 8f193d6..b205dc4 100644
--- a/groups/core2/MEM.txt
+++ b/groups/core2/MEM.txt
@@ -3,11 +3,13 @@ SHORT Main memory bandwidth in MBytes/s
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
 Memory data volume [GBytes] 1.0E-09*PMC0*64.0
diff --git a/groups/core2/TLB.txt b/groups/core2/TLB.txt
index f36abfe..d536d88 100644
--- a/groups/core2/TLB.txt
+++ b/groups/core2/TLB.txt
@@ -3,12 +3,14 @@ SHORT TLB miss rate/ratio
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  DTLB_MISSES_ANY
 PMC1  L1D_ALL_CACHE_REF
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 DTLB request rate    PMC1/FIXC0
 DTLB miss rate    PMC0/FIXC0
diff --git a/groups/haswell/ENERGY.txt b/groups/haswell/ENERGY.txt
index 039563c..15b1c45 100644
--- a/groups/haswell/ENERGY.txt
+++ b/groups/haswell/ENERGY.txt
@@ -6,6 +6,8 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR3  PWR_DRAM_ENERGY
 
 METRICS
 Runtime (RDTSC) [s] time 
@@ -15,11 +17,18 @@ CPI  FIXC1/FIXC0
 Temperature [C]  TMP0
 Energy [J]  PWR0
 Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
 
 LONG
 Formula:
 Power =  PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
 -
 Haswell implements the new RAPL interface. This interface enables to
-monitor the consumed energy on the package (socket) level.
+monitor the consumed energy on the package (socket) and DRAM level.
+The PP0 energy domain is often referred to an integrated GPU.
 
diff --git a/groups/haswell/ICACHE.txt b/groups/haswell/ICACHE.txt
new file mode 100644
index 0000000..6ce3ce8
--- /dev/null
+++ b/groups/haswell/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate  = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/haswell/L2.txt b/groups/haswell/L2.txt
new file mode 100644
index 0000000..47d8ec7
--- /dev/null
+++ b/groups/haswell/L2.txt
@@ -0,0 +1,33 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s]  1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes]  1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cachelines loaded from the L2 to the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache.
diff --git a/groups/haswell/L2CACHE.txt b/groups/haswell/L2CACHE.txt
index 3d7c36e..8186f69 100644
--- a/groups/haswell/L2CACHE.txt
+++ b/groups/haswell/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_TRANS_ALL_REQUESTS
+PMC0  L2_RQSTS_REFERENCES
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,9 +18,9 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_TRANS_ALL_REQUESTS / INSTR_RETIRED_ANY
+L2 request rate = L2_RQSTS_REFERENCES / INSTR_RETIRED_ANY
 L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_TRANS_ALL_REQUESTS
+L2 miss ratio = L2_RQSTS_MISS / L2_RQSTS_REFERENCES
 -
 This group measures the locality of your data accesses with regard to the
 L2 Cache. L2 request rate tells you how data intensive your code is
diff --git a/groups/haswell/L2CACHE.txt b/groups/haswell/L3CACHE.txt
similarity index 53%
copy from groups/haswell/L2CACHE.txt
copy to groups/haswell/L3CACHE.txt
index 3d7c36e..d4fd89e 100644
--- a/groups/haswell/L2CACHE.txt
+++ b/groups/haswell/L3CACHE.txt
@@ -1,32 +1,32 @@
-SHORT L2 cache miss rate/ratio
+SHORT L3 cache miss rate/ratio
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_TRANS_ALL_REQUESTS
-PMC1  L2_RQSTS_MISS
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 request rate PMC0/FIXC0
-L2 miss rate PMC1/FIXC0
-L2 miss ratio PMC1/PMC0
+L3 request rate (PMC0)/FIXC0
+L3 miss rate PMC1/FIXC0
+L3 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_TRANS_ALL_REQUESTS / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_TRANS_ALL_REQUESTS
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
+L3 miss rate  = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
+L3 Cache. L3 request rate tells you how data intensive your code is
 or how many Data accesses you have in average per instruction.
-The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
+The L3 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L3 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/haswell/TLB.txt b/groups/haswell/TLB.txt
deleted file mode 100644
index 78bf096..0000000
--- a/groups/haswell/TLB.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-SHORT  TLB miss rate/ratio
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L1 DTLB miss rate  PMC0/FIXC0
-
-LONG
-Formulas:
-DTLB miss rate  LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
--
-The DTLB miss  rate gives a measure how often a TLB miss occured
-per instruction. 
-
diff --git a/groups/haswell/TLB_DATA.txt b/groups/haswell/TLB_DATA.txt
new file mode 100644
index 0000000..2f59772
--- /dev/null
+++ b/groups/haswell/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration PMC2
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration PMC3
+
+LONG
+Formulas:
+L1 DTLB load misses     DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate  DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
+L1 DTLB store misses     DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate  DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long the page walks took, in cycles.
+
diff --git a/groups/haswell/TLB_INSTR.txt b/groups/haswell/TLB_INSTR.txt
new file mode 100644
index 0000000..f95f78a
--- /dev/null
+++ b/groups/haswell/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration PMC1
+
+
+LONG
+Formulas:
+L1 ITLB misses     ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate  ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long the page walks took, in cycles.
+
diff --git a/groups/ivybridge/ENERGY.txt b/groups/ivybridge/ENERGY.txt
index 4646bf5..3f70077 100644
--- a/groups/ivybridge/ENERGY.txt
+++ b/groups/ivybridge/ENERGY.txt
@@ -6,6 +6,7 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
 PWR3  PWR_DRAM_ENERGY
 
 METRICS
@@ -16,12 +17,15 @@ CPI  FIXC1/FIXC0
 Temperature [C]  TMP0
 Energy [J]  PWR0
 Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
 Formula:
 Power =  PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
 -
 IvyBridge implements the new RAPL interface. This interface enables to
diff --git a/groups/ivybridge/FLOPS_AVX.txt b/groups/ivybridge/FLOPS_AVX.txt
index 2bc99ea..e8074c1 100644
--- a/groups/ivybridge/FLOPS_AVX.txt
+++ b/groups/ivybridge/FLOPS_AVX.txt
@@ -12,8 +12,8 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP 32b packed MFlops/s  1.0E-06*(PMC0*8.0)/time
-DP 32b packed MFlops/s  1.0E-06*(PMC1*4.0)/time
+32b packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
+32b packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
 
 LONG
 Formula:
diff --git a/groups/ivybridge/FLOPS_DP.txt b/groups/ivybridge/FLOPS_DP.txt
index 88509c9..1e47b50 100644
--- a/groups/ivybridge/FLOPS_DP.txt
+++ b/groups/ivybridge/FLOPS_DP.txt
@@ -13,7 +13,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
 AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
diff --git a/groups/ivybridge/ICACHE.txt b/groups/ivybridge/ICACHE.txt
new file mode 100644
index 0000000..6ce3ce8
--- /dev/null
+++ b/groups/ivybridge/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate  = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/haswell/L2CACHE.txt b/groups/ivybridge/L3CACHE.txt
similarity index 53%
copy from groups/haswell/L2CACHE.txt
copy to groups/ivybridge/L3CACHE.txt
index 3d7c36e..d4fd89e 100644
--- a/groups/haswell/L2CACHE.txt
+++ b/groups/ivybridge/L3CACHE.txt
@@ -1,32 +1,32 @@
-SHORT L2 cache miss rate/ratio
+SHORT L3 cache miss rate/ratio
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_TRANS_ALL_REQUESTS
-PMC1  L2_RQSTS_MISS
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 request rate PMC0/FIXC0
-L2 miss rate PMC1/FIXC0
-L2 miss ratio PMC1/PMC0
+L3 request rate (PMC0)/FIXC0
+L3 miss rate PMC1/FIXC0
+L3 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_TRANS_ALL_REQUESTS / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_TRANS_ALL_REQUESTS
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
+L3 miss rate  = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
+L3 Cache. L3 request rate tells you how data intensive your code is
 or how many Data accesses you have in average per instruction.
-The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
+The L3 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L3 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/ivybridge/MEM.txt b/groups/ivybridge/MEM.txt
index 6632cd4..1f9ff4a 100644
--- a/groups/ivybridge/MEM.txt
+++ b/groups/ivybridge/MEM.txt
@@ -5,23 +5,23 @@ FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 MBOX0C0 CAS_COUNT_RD
-MBOX1C0 CAS_COUNT_WR
-MBOX0C1 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
 MBOX1C1 CAS_COUNT_WR
-MBOX0C2 CAS_COUNT_RD
-MBOX1C2 CAS_COUNT_WR
-MBOX0C3 CAS_COUNT_RD
-MBOX1C3 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0
+Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
 
 LONG
 Profiling group to measure main memory bandwidth drawn by all cores of
diff --git a/groups/ivybridge/MEM_DP.txt b/groups/ivybridge/MEM_DP.txt
index 2e4138e..7bc76cd 100644
--- a/groups/ivybridge/MEM_DP.txt
+++ b/groups/ivybridge/MEM_DP.txt
@@ -4,25 +4,27 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
 PWR3  PWR_DRAM_ENERGY
 PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
 PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
 PMC2  SIMD_FP_256_PACKED_DOUBLE
 MBOX0C0 CAS_COUNT_RD
-MBOX1C0 CAS_COUNT_WR
-MBOX0C1 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
 MBOX1C1 CAS_COUNT_WR
-MBOX0C2 CAS_COUNT_RD
-MBOX1C2 CAS_COUNT_WR
-MBOX0C3 CAS_COUNT_RD
-MBOX1C3 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
+Temperature TMP0
 Energy [J]  PWR0
 Power [W] PWR0/time
 Energy DRAM [J]  PWR3
@@ -31,10 +33,10 @@ AVX MFlops/s  1.0E-06*(4.0*PMC2)/time
 MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0
+Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
 
 LONG
 Formula:
@@ -42,6 +44,8 @@ Power =  PWR_PKG_ENERGY / runtime
 Power DRAM = PWR_DRAM_ENERGY / runtime
 MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
 AVX MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
 --
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 Since this group is based on uncore events it is only possible to measure on
diff --git a/groups/ivybridge/MEM_SP.txt b/groups/ivybridge/MEM_SP.txt
index d06f263..4388cc4 100644
--- a/groups/ivybridge/MEM_SP.txt
+++ b/groups/ivybridge/MEM_SP.txt
@@ -11,13 +11,13 @@ PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
 PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
 PMC2  SIMD_FP_256_PACKED_SINGLE
 MBOX0C0 CAS_COUNT_RD
-MBOX1C0 CAS_COUNT_WR
-MBOX0C1 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
 MBOX1C1 CAS_COUNT_WR
-MBOX0C2 CAS_COUNT_RD
-MBOX1C2 CAS_COUNT_WR
-MBOX0C3 CAS_COUNT_RD
-MBOX1C3 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -33,10 +33,10 @@ AVX MFlops/s  1.0E-06*(8.0*PMC2)/time
 MFlops/s  1.0E-06*(PMC0*4.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0
+Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
 
 LONG
 Formula:
@@ -44,6 +44,8 @@ Power =  PWR_PKG_ENERGY / runtime
 Power DRAM = PWR_DRAM_ENERGY / runtime
 MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE * 4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE) / runtime
 AVX MFlops/s =  (SIMD_FP_256_PACKED_SINGLE * 8) / runtime
+Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
 --
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 Since this group is based on uncore events it is only possible to measure on
diff --git a/groups/ivybridge/TLB.txt b/groups/ivybridge/TLB.txt
deleted file mode 100644
index 83f0e24..0000000
--- a/groups/ivybridge/TLB.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-SHORT  TLB miss rate/ratio
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L1 DTLB miss rate  PMC0/FIXC0
-
-LONG
-Formulas:
-DTLB miss rate  LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
--
-The DTLB miss rate gives a measure how often a TLB miss occured per instruction
-in average. TLB misses increase if many pages (4kB data chunks) are accessed in
-a short time period.
-
diff --git a/groups/ivybridge/TLB_DATA.txt b/groups/ivybridge/TLB_DATA.txt
new file mode 100644
index 0000000..2f59772
--- /dev/null
+++ b/groups/ivybridge/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration PMC2
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration PMC3
+
+LONG
+Formulas:
+L1 DTLB load misses     DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate  DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
+L1 DTLB store misses     DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate  DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long the page walks took, in cycles.
+
diff --git a/groups/ivybridge/TLB_INSTR.txt b/groups/ivybridge/TLB_INSTR.txt
new file mode 100644
index 0000000..f95f78a
--- /dev/null
+++ b/groups/ivybridge/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration PMC1
+
+
+LONG
+Formulas:
+L1 ITLB misses     ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate  ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long the page walks took, in cycles.
+
diff --git a/groups/sandybridge/ENERGY.txt b/groups/sandybridge/ENERGY.txt
index b9a0491..9261934 100644
--- a/groups/sandybridge/ENERGY.txt
+++ b/groups/sandybridge/ENERGY.txt
@@ -6,6 +6,7 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
 PWR3  PWR_DRAM_ENERGY
 
 METRICS
@@ -15,13 +16,16 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Temperature [C]  TMP0
 Energy [J]  PWR0
-Energy DRAM [J]  PWR3
 Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
 Formula:
 Power =  PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
 -
 SandyBridge implements the new RAPL interface. This interface enables to
diff --git a/groups/sandybridge/FLOPS_DP.txt b/groups/sandybridge/FLOPS_DP.txt
index ef1a0e8..cda580a 100644
--- a/groups/sandybridge/FLOPS_DP.txt
+++ b/groups/sandybridge/FLOPS_DP.txt
@@ -13,7 +13,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
 32b AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
diff --git a/groups/haswell/L2CACHE.txt b/groups/sandybridge/L3CACHE.txt
similarity index 53%
copy from groups/haswell/L2CACHE.txt
copy to groups/sandybridge/L3CACHE.txt
index 3d7c36e..d4fd89e 100644
--- a/groups/haswell/L2CACHE.txt
+++ b/groups/sandybridge/L3CACHE.txt
@@ -1,32 +1,32 @@
-SHORT L2 cache miss rate/ratio
+SHORT L3 cache miss rate/ratio
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_TRANS_ALL_REQUESTS
-PMC1  L2_RQSTS_MISS
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 request rate PMC0/FIXC0
-L2 miss rate PMC1/FIXC0
-L2 miss ratio PMC1/PMC0
+L3 request rate (PMC0)/FIXC0
+L3 miss rate PMC1/FIXC0
+L3 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_TRANS_ALL_REQUESTS / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_TRANS_ALL_REQUESTS
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
+L3 miss rate  = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
+L3 Cache. L3 request rate tells you how data intensive your code is
 or how many Data accesses you have in average per instruction.
-The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
+The L3 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L3 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
diff --git a/groups/sandybridge/MEM.txt b/groups/sandybridge/MEM.txt
index 6632cd4..1f9ff4a 100644
--- a/groups/sandybridge/MEM.txt
+++ b/groups/sandybridge/MEM.txt
@@ -5,23 +5,23 @@ FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 MBOX0C0 CAS_COUNT_RD
-MBOX1C0 CAS_COUNT_WR
-MBOX0C1 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
 MBOX1C1 CAS_COUNT_WR
-MBOX0C2 CAS_COUNT_RD
-MBOX1C2 CAS_COUNT_WR
-MBOX0C3 CAS_COUNT_RD
-MBOX1C3 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0
+Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
 
 LONG
 Profiling group to measure main memory bandwidth drawn by all cores of
diff --git a/groups/sandybridge/MEM_DP.txt b/groups/sandybridge/MEM_DP.txt
index 2891a45..78fbd18 100644
--- a/groups/sandybridge/MEM_DP.txt
+++ b/groups/sandybridge/MEM_DP.txt
@@ -11,13 +11,13 @@ PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
 PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
 PMC2  SIMD_FP_256_PACKED_DOUBLE
 MBOX0C0 CAS_COUNT_RD
-MBOX1C0 CAS_COUNT_WR
-MBOX0C1 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
 MBOX1C1 CAS_COUNT_WR
-MBOX0C2 CAS_COUNT_RD
-MBOX1C2 CAS_COUNT_WR
-MBOX0C3 CAS_COUNT_RD
-MBOX1C3 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -33,10 +33,10 @@ MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
 32b AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0
+Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
 
 LONG
 Formula:
@@ -44,6 +44,8 @@ Power =  PWR_PKG_ENERGY / runtime
 Power DRAM = PWR_DRAM_ENERGY / runtime
 MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
 AVX MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
 --
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 Since this group is based on uncore events it is only possible to measure on
diff --git a/groups/sandybridge/MEM_SP.txt b/groups/sandybridge/MEM_SP.txt
index 9ac34d0..1ede713 100644
--- a/groups/sandybridge/MEM_SP.txt
+++ b/groups/sandybridge/MEM_SP.txt
@@ -11,13 +11,13 @@ PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
 PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
 PMC2  SIMD_FP_256_PACKED_DOUBLE
 MBOX0C0 CAS_COUNT_RD
-MBOX1C0 CAS_COUNT_WR
-MBOX0C1 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
 MBOX1C1 CAS_COUNT_WR
-MBOX0C2 CAS_COUNT_RD
-MBOX1C2 CAS_COUNT_WR
-MBOX0C3 CAS_COUNT_RD
-MBOX1C3 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -33,10 +33,10 @@ MFlops/s  1.0E-06*(PMC0*4.0+PMC1)/time
 32b AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0
+Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
 
 LONG
 Formula:
@@ -44,6 +44,8 @@ Power =  PWR_PKG_ENERGY / runtime
 Power DRAM = PWR_DRAM_ENERGY / runtime
 MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE * 4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE) / runtime
 AVX MFlops/s =  (SIMD_FP_256_PACKED_SINGLE * 8) / runtime
+Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
 --
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 Since this group is based on uncore events it is only possible to measure on
diff --git a/groups/sandybridge/TLB.txt b/groups/sandybridge/TLB.txt
deleted file mode 100644
index 83f0e24..0000000
--- a/groups/sandybridge/TLB.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-SHORT  TLB miss rate/ratio
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L1 DTLB miss rate  PMC0/FIXC0
-
-LONG
-Formulas:
-DTLB miss rate  LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
--
-The DTLB miss rate gives a measure how often a TLB miss occured per instruction
-in average. TLB misses increase if many pages (4kB data chunks) are accessed in
-a short time period.
-
diff --git a/groups/sandybridge/TLB_DATA.txt b/groups/sandybridge/TLB_DATA.txt
new file mode 100644
index 0000000..2f59772
--- /dev/null
+++ b/groups/sandybridge/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration PMC2
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration PMC3
+
+LONG
+Formulas:
+L1 DTLB load misses     DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate  DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
+L1 DTLB store misses     DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate  DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The durations measure how long the page walks took, in cycles.
+
diff --git a/groups/sandybridge/TLB_INSTR.txt b/groups/sandybridge/TLB_INSTR.txt
new file mode 100644
index 0000000..f95f78a
--- /dev/null
+++ b/groups/sandybridge/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration PMC1
+
+
+LONG
+Formulas:
+L1 ITLB misses     ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate  ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long the page walks took, in cycles.
+
diff --git a/groups/core2/BRANCH.txt b/groups/silvermont/BRANCH.txt
similarity index 51%
copy from groups/core2/BRANCH.txt
copy to groups/silvermont/BRANCH.txt
index 15a9ae0..cbaf834 100644
--- a/groups/core2/BRANCH.txt
+++ b/groups/silvermont/BRANCH.txt
@@ -3,12 +3,14 @@ SHORT Branch prediction miss rate/ratio
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
-PMC0  BR_INST_RETIRED_ANY
-PMC1  BR_INST_RETIRED_MISPRED
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Branch rate   PMC0/FIXC0
 Branch misprediction rate  PMC1/FIXC0
@@ -17,12 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ANY / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_INST_RETIRED_MISPRED / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_INST_RETIRED_MISPRED / BR_INST_RETIRED_ANY
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ANY
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
-into relation what ration of all branch instruction where mispredicted.
+into relation what ratio of all branch instructions were mispredicted.
 Instructions per branch is 1/Branch rate.
+
diff --git a/groups/haswell/ENERGY.txt b/groups/silvermont/ENERGY.txt
similarity index 81%
copy from groups/haswell/ENERGY.txt
copy to groups/silvermont/ENERGY.txt
index 039563c..5646a9a 100644
--- a/groups/haswell/ENERGY.txt
+++ b/groups/silvermont/ENERGY.txt
@@ -8,7 +8,7 @@ TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
 
 METRICS
-Runtime (RDTSC) [s] time 
+Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
@@ -20,6 +20,6 @@ LONG
 Formula:
 Power =  PWR_PKG_ENERGY / time
 -
-Haswell implements the new RAPL interface. This interface enables to
+Silvermont implements the new RAPL interface. This interface enables to
 monitor the consumed energy on the package (socket) level.
 
diff --git a/groups/silvermont/ICACHE.txt b/groups/silvermont/ICACHE.txt
new file mode 100644
index 0000000..6ce3ce8
--- /dev/null
+++ b/groups/silvermont/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate  = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/silvermont/L1TOL2.txt b/groups/silvermont/L1TOL2.txt
new file mode 100644
index 0000000..225533d
--- /dev/null
+++ b/groups/silvermont/L1TOL2.txt
@@ -0,0 +1,28 @@
+SHORT L2 load cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOPS_RETIRED_L1_MISS_LOADS 
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0)*64.0
+
+LONG
+Formulas:
+L2 Load [MBytes/s] = 1.0E-06*MEM_UOPS_RETIRED_L1_MISS_LOADS*64/time
+L2 bandwidth [MBytes/s] = 1.0E-06*(MEM_UOPS_RETIRED_L1_MISS_LOADS)*64/time
+L2 data volume [GBytes] = 1.0E-09*(MEM_UOPS_RETIRED_L1_MISS_LOADS)*64
+-
+Profiling group to measure L2 load cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L1 cache. Since there is no possibility to retrieve
+the evicted cache lines, this group measures only the load cache bandwidth.
+The group also outputs the total data volume loaded from L2 into L1.
+
diff --git a/groups/silvermont/L2TOMEM.txt b/groups/silvermont/L2TOMEM.txt
new file mode 100644
index 0000000..bc4cbed
--- /dev/null
+++ b/groups/silvermont/L2TOMEM.txt
@@ -0,0 +1,26 @@
+SHORT L2 to Mem load cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOPS_RETIRED_L2_MISS_LOADS 
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 to MEM load bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
+L2 to MEM load data volume [GBytes] 1.0E-09*(PMC0)*64.0
+
+LONG
+Formulas:
+L2 to MEM load bandwidth [MBytes/s] = 1.0E-06*(MEM_UOPS_RETIRED_L2_MISS_LOADS)*64/time
+L2 to MEM load data volume [GBytes] = 1.0E-09*(MEM_UOPS_RETIRED_L2_MISS_LOADS)*64
+-
+Profiling group to measure L2 to MEM load cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 cache. Since there is no possibility to retrieve
+the evicted cache lines, this group measures only the load cache bandwidth.
+The group also outputs the total data volume loaded from memory into L2.
+
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 0000000..fd0ffdf
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,12 @@
+obj-m := enable_rdpmc.o
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+all:
+	$(MAKE) -Wpacked -C $(KERNELDIR) M=$(PWD) modules
+
+modules_install: # install with mode 0644: a kernel module must not be world-writable
+	install -m 644 enable_rdpmc.ko /lib/modules/$(shell uname -r)/extra/
+
+clean:
+	rm -f *.ko *.o modules.order Module.symvers enable_rdpmc.mod.c
diff --git a/kernel/enable_rdpmc.c b/kernel/enable_rdpmc.c
new file mode 100644
index 0000000..0ecc86d
--- /dev/null
+++ b/kernel/enable_rdpmc.c
@@ -0,0 +1,73 @@
+/*  
+ *  Read PMC in kernel mode.
+ */
+#include <linux/module.h>   /* Needed by all modules */
+#include <linux/kernel.h>   /* Needed for KERN_INFO */
+
+#define MODULE_PARAM(type, name, value, desc) \
+    type name = value; \
+    module_param(name, type, 0664); \
+    MODULE_PARM_DESC(name, desc)
+
+MODULE_PARAM(int, debug, 0, "Debug output");
+
+
+static uint64_t printc4(void) {
+    uint64_t output;
+    // Return the current CR4 value; volatile keeps the read from being cached, moved or dropped.
+    __asm__ __volatile__("\t mov %%cr4,%0" : "=r"(output));
+    return output;
+}
+
+static void setc4b8(void * info) {
+    // Set CR4 bit 8 (9th bit from the right) on the calling CPU to enable RDPMC for userspace.
+    __asm__("push   %rax\n\t"
+            "mov    %cr4,%rax;\n\t"
+            "or     $(1 << 8),%rax;\n\t"
+            "mov    %rax,%cr4;\n\t"
+            "wbinvd\n\t"
+            "pop    %rax"
+    );
+
+    if (debug) { // report only bit 8, which is what the message label names, not all of CR4
+        printk(KERN_INFO "Processor %d, RDPMC_ENABLE_BIT=%llu\n", smp_processor_id(), (unsigned long long)((printc4() >> 8) & 1));
+    }
+}
+
+static void clearc4b8(void * info) {
+    // Clear CR4 bit 8 on the calling CPU, undoing setc4b8().
+    __asm__("push   %rax\n\t"
+            "push   %rbx\n\t"
+            "mov    %cr4,%rax;\n\t"
+            "mov  $(1 << 8), %rbx\n\t"
+            "not  %rbx\n\t"
+            "and   %rbx, %rax;\n\t"
+            "mov    %rax,%cr4;\n\t"
+            "wbinvd\n\t"
+            "pop    %rbx\n\t"
+            "pop    %rax\n\t"
+    );
+
+    if (debug) { // report only bit 8, which is what the message label names, not all of CR4
+        printk(KERN_INFO "Processor %d, RDPMC_ENABLE_BIT=%llu\n", smp_processor_id(), (unsigned long long)((printc4() >> 8) & 1));
+    }
+}
+
+
+
+int start_module(void)
+{
+    on_each_cpu(setc4b8, NULL, 1); /* wait=1: every CPU has the bit set before module load returns */
+    return 0;
+}
+void stop_module(void)
+{
+    on_each_cpu(clearc4b8, NULL, 1); /* wait=1: callbacks must finish on all CPUs before the module code is unloaded */
+}
+
+module_init(start_module);
+module_exit(stop_module)
+
+MODULE_AUTHOR("Thomas Roehl <Thomas.Roehl at fau.de>");
+MODULE_DESCRIPTION("Enable RDPMC for userspace");
+MODULE_LICENSE("GPL");
diff --git a/make/include_GCC.mk b/make/include_GCC.mk
index 38606c1..1ccfd88 100644
--- a/make/include_GCC.mk
+++ b/make/include_GCC.mk
@@ -12,7 +12,7 @@ GEN_PMHEADER = ./perl/gen_events.pl
 #ANSI_CFLAGS += -Wextra
 #ANSI_CFLAGS += -Wall
 
-CFLAGS   =  -O2  -Wno-format -std=c99
+CFLAGS   =  -O2  -Wno-format -Wno-nonnull -std=c99
 FCFLAGS  = -module ./  # ifort
 #FCFLAGS  = -J ./  -fsyntax-only  #gfortran
 PASFLAGS  = x86-64
diff --git a/make/include_ICC.mk b/make/include_ICC.mk
index b379daa..ce49bfe 100644
--- a/make/include_ICC.mk
+++ b/make/include_ICC.mk
@@ -7,11 +7,10 @@ GEN_PAS = ./perl/generatePas.pl
 GEN_GROUPS = ./perl/generateGroups.pl 
 GEN_PMHEADER = ./perl/gen_events.pl 
 
-ANSI_CFLAGS  = -strict-ansi
 ANSI_CFLAGS += -std=c99
 
-CFLAGS   =  -O1 -Wno-format -vec-report=0
-FCFLAGS  = -module ./ 
+CFLAGS   =  -O1 -Wno-format
+FCFLAGS  = -module ./
 ASFLAGS  = -gdwarf-2
 PASFLAGS  = x86-64
 CPPFLAGS =
@@ -21,7 +20,6 @@ SHARED_CFLAGS = -fpic
 SHARED_LFLAGS = -shared
 
 DEFINES  = -D_GNU_SOURCE
-DEFINES  += -DMAX_NUM_THREADS=128
 DEFINES  += -DPAGE_ALIGNMENT=4096
 #enable this option to build likwid-bench with marker API for likwid-perfctr
 #DEFINES  += -DPERFMON
diff --git a/perl/generatePas.pl b/perl/generatePas.pl
index 520cbc6..9c1dcd1 100755
--- a/perl/generatePas.pl
+++ b/perl/generatePas.pl
@@ -98,7 +98,7 @@ while (defined(my $file = readdir(DIR))) {
                 }
             } elsif ($line =~ /TYPE[ ]+(SINGLE|DOUBLE)/) {
                 $type = $1;
-            } elsif ($line =~ /FLOPS[ ]+([0-9]+)/) {
+            } elsif ($line =~ /FLOPS[ ]+([0-9.]+)/) {
                 $flops = $1;
             } elsif ($line =~ /BYTES[ ]+([0-9]+)/) {
                 $bytes = $1;
diff --git a/perl/likwid-mpirun b/perl/likwid-mpirun
index fb8daf1..b922359 100755
--- a/perl/likwid-mpirun
+++ b/perl/likwid-mpirun
@@ -1,11 +1,39 @@
 #!/usr/bin/perl
+# =======================================================================================
+#
+#      Filename:  likwid-mpirun
+#
+#      Description:  Wrapper application to mpi startup mechanisms. Builds on
+#                    likwid to control affinity and has integrated perfctr support.
+#
+#      Version:   <VERSION>
+#      Released:  <DATE>
+#
+#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2014 Jan Treibig
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
 
 use Getopt::Long;
 ##############################
 #       CONFIGURATION        #
-##############################       
-my $LIKWIDPIN  = 'likwid-pin';
-my $LIKWIDPERF = 'likwid-perfctr';
+##############################
+my $LIKWIDPIN  = '<PREFIX>/bin/likwid-pin';
+my $LIKWIDPERF = '<PREFIX>/bin/likwid-perfctr';
 my $MPIROOT_openmpi  =  $ENV{'MPIHOME'};
 my $MPIROOT_intelmpi =  $ENV{'MPIHOME'};
 my $MPIEXEC_openmpi  = "$MPIROOT_openmpi/bin/mpiexec";
@@ -425,4 +453,4 @@ if (-e $WrapperScript and not $debug) {
     unlink ($Hostfilename);
 }
 
-# vim: foldmethod=marker foldmarker=#<#,#># 
+# vim: foldmethod=marker foldmarker=#<#,#>#
diff --git a/perl/likwid-setFrequencies b/perl/likwid-setFrequencies
index 8cc2a97..5834441 100755
--- a/perl/likwid-setFrequencies
+++ b/perl/likwid-setFrequencies
@@ -1,4 +1,31 @@
 #!/usr/bin/perl
+# =======================================================================================
+#
+#      Filename:  likwid-setFrequencies
+#
+#      Description:  Application allowing to change core frequencies
+#
+#      Version:   <VERSION>
+#      Released:  <DATE>
+#
+#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2014 Jan Treibig
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
 
 use Getopt::Std;
 
@@ -30,7 +57,7 @@ sub usage
 This script allows to switch governors and set fixed
 frequencies on Linux system.
 
-usage: $0 [-hlp] [-g governor] [-d domain] [-f frequency]
+usage: $0 [-hlp] [-g governor] [-c domain] [-f frequency]
 -h          : this (help) message
 -p          : print current frequencies
 -l          : list available frequencies
@@ -145,10 +172,14 @@ if ($opt{g} eq 'turbo') {
 
 if ($opt{g}) {
     $governor = $opt{g};
-    print "Set governor in domain $domain to $governor \n";
-    foreach my $processID (@processors) {
-#    print "$SYSCMD $processID 0 $governor\n";
-        system("$SYSCMD $processID 0 $governor");
+    if (($governor ne "ondemand") and ($governor ne "performance")) {
+        print "Governor $governor not valid\n";
+    } else {
+        print "Set governor in domain $domain to $governor \n";
+        foreach my $processID (@processors) {
+            system("$SYSCMD $processID 0 $governor");
+        }
     }
 }
 
+# vim: foldmethod=marker foldmarker=#<#,#>#
diff --git a/perl/set_license.pl b/perl/set_license.pl
index 9ce5fda..f80326d 100755
--- a/perl/set_license.pl
+++ b/perl/set_license.pl
@@ -11,8 +11,8 @@ my $fc = '!';
 
 #my $VERSION   = '<VERSION>';
 #my $DATE   = '<DATE>';
-my $VERSION   = '3.1.2';
-my $DATE   = '2.6.2014';
+my $VERSION   = '3.1.3';
+my $DATE   = '4.11.2014';
 my $YEAR  = '2014';
 my $AUTHOR = 'Jan Treibig';
 my $LICENSE = 'gpl';
diff --git a/perl/templates/group.tt b/perl/templates/group.tt
index 43ae7c3..2122caf 100644
--- a/perl/templates/group.tt
+++ b/perl/templates/group.tt
@@ -2,14 +2,65 @@
 
 #define NUM_GROUPS_[% arch FILTER upper %] [% numGroups %]
 
+[% FOREACH group IN groups %]
+static const char* group_names_[% arch FILTER ucfirst %]_[% group.name %] [] = {[% FOREACH metric IN group.metrics %] "[% metric.label %]", [% END %] NULL};
+[% END %]
+
 static PerfmonGroupMap [% arch %]_group_map[NUM_GROUPS_[% arch FILTER upper %]] = {
 [% FOREACH group IN groups %]
-    {"[% group.name %]",[% group.name %],[% group.isUncore %],"[% group.shortHelp %]","[% group.eventSet %]"},
+    {"[% group.name %]",[% group.name %],[% group.isUncore %],"[% group.shortHelp %]","[% group.eventSet %]", 0 [% FOREACH metric IN group.metrics %] +1 [% END %], group_names_[% arch FILTER ucfirst %]_[% group.name %]
+    },
 [% END %]
 };
 
+void perfmon_getDerivedCounterValues[% arch FILTER ucfirst %](PerfmonGroup group, float * values, float * out_max, float * out_min){
+    double time = rdtscTime;
+    double inverseClock = 1.0 /(double) timer_getCpuClock();
+
+    values[0] = time;
+    out_min[0] = time;
+    out_max[0] = time;
+
+    switch ( group ) {
+    [% FOREACH group IN groups %]
+        case [% group.name %]:{
+            int threadId;
+            int counter = 0;
+            double sum,min,max;
+
+        [% FOREACH metric IN group.metrics %]
+            sum = 0;
+            min = 1e300;
+            max = 0;
+
+            for(threadId=0; threadId < perfmon_numThreads; threadId++)
+            {
+                double cur = [% metric.rule %];
+                cur = isnan(cur) ? 0.0 : cur;
+                sum += cur;
+                max = max > cur ? max : cur;
+                min = min < cur ? min : cur;                        
+            }
+
+            values[counter] = (float) sum / perfmon_numThreads;
+            out_min[counter] = (float) min;
+            out_max[counter] = (float) max;
+            counter++;
+        [% END %]
+        return;        
+        }
+    [% END %]
+
+        default:
+            fprintf (stderr, "perfmon_getDerivedCounterValues[% arch %]: Unknown group! Exiting!\n" );
+            exit (EXIT_FAILURE);
+            break;
+    }
+}
+
+
 void
-perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group)
+perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup groupId)
 {
     int threadId;
     double time = rdtscTime;
@@ -25,7 +76,7 @@ perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group)
     uint64_t cpi_cyc  = 0;
     int cpi_index = 0;
 
-    switch ( group ) 
+    switch ( groupId ) 
     {
 [% FOREACH group IN groups %]
         case [% group.name %]:
diff --git a/src/access-daemon/Makefile b/src/access-daemon/Makefile
index 1f2b3d9..afd751b 100644
--- a/src/access-daemon/Makefile
+++ b/src/access-daemon/Makefile
@@ -4,8 +4,8 @@
 #
 #      Description:  accessDaemon Makefile
 #
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 #
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
@@ -38,12 +38,13 @@ ifeq ($(COMPILER),GCC)
 CFLAGS    +=  -pedantic -Wall -Wextra -std=c99
 endif
 CPPFLAGS :=  $(DEFINES) $(INCLUDES)
+Q=
 
 all: $(DAEMON_TARGET) $(SETFREQ_TARGET)
 
 $(DAEMON_TARGET): accessDaemon.c
-	$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(DAEMON_TARGET) accessDaemon.c
+	$(CC) $(ANSI_CFLAGS) $(CFLAGS) $(CPPFLAGS) -o ../../$(DAEMON_TARGET) accessDaemon.c
 
 $(SETFREQ_TARGET): setFreq.c
-	$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreq.c
+	$(CC) $(ANSI_CFLAGS) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreq.c
 
diff --git a/src/access-daemon/accessDaemon.c b/src/access-daemon/accessDaemon.c
index a1903ab..5679a92 100644
--- a/src/access-daemon/accessDaemon.c
+++ b/src/access-daemon/accessDaemon.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Implementation of access daemon.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Authors:  Michael Meier, michael.meier at rrze.fau.de
  *                Jan Treibig (jt), jan.treibig at gmail.com
@@ -55,34 +55,88 @@
 #define str(x) #x
 
 #define CHECK_ERROR(func, msg)  \
-    if ((func) < 0) { syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); }
+    if ((func) < 0) { \
+        syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
+    }
 
 #define CHECK_FILE_ERROR(func, msg)  \
-    if ((func) == 0) { syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); }
+    if ((func) == 0) { \
+        syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
+    }
 
 
 #define EXIT_IF_ERROR(func, msg)  \
-    if ((func) < 0) { syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); stop_daemon(); exit(EXIT_FAILURE); }
+    if ((func) < 0) { \
+        syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
+        stop_daemon(); \
+        exit(EXIT_FAILURE); \
+    }
 
 
-#define CPUID                    \
-    __asm__ volatile ("cpuid"    \
-            : "=a" (eax),            \
-            "=b" (ebx)             \
+#define CPUID \
+    __asm__ volatile ("cpuid" \
+            : "=a" (eax), "=b" (ebx) \
             : "0" (eax))
 
 
-#define  P6_FAMILY        0x6U
-#define  K8_FAMILY        0xFU
-#define  K10_FAMILY       0x10U
-#define  K15_FAMILY       0x15U
-#define  K16_FAMILY       0x16U
-
+/* Intel P6 */
+#define PENTIUM_M_BANIAS     0x09U
+#define PENTIUM_M_DOTHAN     0x0DU
+#define CORE_DUO             0x0EU
+#define CORE2_65             0x0FU
+#define CORE2_45             0x17U
+#define ATOM                 0x1CU
+#define ATOM_45              0x26U
+#define ATOM_32              0x36U
+#define ATOM_22              0x27U
+#define ATOM_SILVERMONT      0x4DU
+#define NEHALEM              0x1AU
+#define NEHALEM_BLOOMFIELD   0x1AU
+#define NEHALEM_LYNNFIELD    0x1EU
+#define NEHALEM_LYNNFIELD_M  0x1FU
+#define NEHALEM_WESTMERE     0x2CU
+#define NEHALEM_WESTMERE_M   0x25U
 #define SANDYBRIDGE          0x2AU
 #define SANDYBRIDGE_EP       0x2DU
+#define HASWELL              0x3CU
+#define HASWELL_EX           0x3FU
+#define HASWELL_M1           0x45U
+#define HASWELL_M2           0x46U
 #define IVYBRIDGE            0x3AU
 #define IVYBRIDGE_EP         0x3EU
-#define HASWELL              0x3CU
+#define NEHALEM_EX           0x2EU
+#define WESTMERE_EX          0x2FU
+#define XEON_MP              0x1DU
+
+/* Intel MIC */
+#define XEON_PHI           0x01U
+
+/* AMD K10 */
+#define BARCELONA      0x02U
+#define SHANGHAI       0x04U
+#define ISTANBUL       0x08U
+#define MAGNYCOURS     0x09U
+
+/* AMD K8 */
+#define OPTERON_SC_1MB  0x05U
+#define OPTERON_DC_E    0x21U
+#define OPTERON_DC_F    0x41U
+#define ATHLON64_X2     0x43U
+#define ATHLON64_X2_F   0x4BU
+#define ATHLON64_F1     0x4FU
+#define ATHLON64_F2     0x5FU
+#define ATHLON64_X2_G   0x6BU
+#define ATHLON64_G1     0x6FU
+#define ATHLON64_G2     0x7FU
+
+
+#define  P6_FAMILY        0x6U
+#define  MIC_FAMILY       0xBU
+#define  NETBURST_FAMILY  0xFFU
+#define  K15_FAMILY       0x15U
+#define  K16_FAMILY       0x16U
+#define  K10_FAMILY       0x10U
+#define  K8_FAMILY        0xFU
 
 #define PCI_ROOT_PATH    "/proc/bus/pci/"
 #define MAX_PATH_LENGTH   60
@@ -159,6 +213,44 @@ static int allowed_intel(uint32_t reg)
     }
 }
 
+static int allowed_silvermont(uint32_t reg)
+{
+    /* Return 1 if MSR address reg is on the Silvermont access whitelist, 0 otherwise. */
+    if ( ((reg & 0x0F8U) == 0x0C0U) ||
+            ((reg & 0xFF0U) == 0x180U) ||
+            ((reg & 0xF00U) == 0x300U) ||
+            ((reg & 0xF00U) == 0x600U) ||
+            ((reg & 0xF00U) == 0xC00U) ||
+            ((reg & 0xF00U) == 0xD00U) ||
+            (reg == 0x1A0)  ||
+            (reg == 0x0CE)  ||
+            (reg == 0x1AD)  ||
+            (reg == 0x19C)  ||
+            (reg == 0x1A2)  ||
+            (reg == 0x1A6) ||
+            (reg == 0x1A7))
+    {
+        return 1;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+static int allowed_westmereEX(uint32_t reg)
+{
+    if (allowed_intel(reg) == 1)
+    {
+        return 1;
+    }
+    else if ((reg & 0xF00) == 0xF00)
+    {
+        return 1;
+    }
+    return 0;
+}
+
 static int allowed_sandybridge(uint32_t reg)
 {
     if ( ((reg & 0x0F8U) == 0x0C0U) ||
@@ -182,6 +274,30 @@ static int allowed_sandybridge(uint32_t reg)
     }
 }
 
+static int allowed_haswell(uint32_t reg)
+{
+    if ( ((reg & 0x0F8U) == 0x0C0U) ||
+            ((reg & 0xFF0U) == 0x180U) ||
+            ((reg & 0xF00U) == 0x300U) ||
+            ((reg & 0xF00U) == 0xC00U) ||
+            ((reg & 0xF00U) == 0xD00U) ||
+            ((reg & 0xF00U) == 0xE00U) ||
+            ((reg & 0xF00U) == 0x600U) ||
+            ((reg & 0xF00U) == 0x700U) ||
+            (reg == 0x1A0)  ||
+            (reg == 0x0CE)  ||
+            (reg == 0x19C)  ||
+            (reg == 0x1A2)  ||
+            (reg == 0x1AD)  ||
+            (reg == 0x1A6))
+    {
+        return 1;
+    }
+    else
+    {
+        return 0;
+    }
+}
 
 static int allowed_amd(uint32_t reg)
 {
@@ -231,6 +347,11 @@ static void msr_read(AccessDataRecord * dRecord)
     dRecord->errorcode = ERR_NOERROR;
     dRecord->data = 0;
 
+    if (FD_MSR[cpu] == -2)
+    {
+        dRecord->errorcode = ERR_NODEV;
+        return;
+    }
     if (!allowed(reg))
     {
         syslog(LOG_ERR, "attempt to read from restricted register 0x%x", reg);
@@ -256,6 +377,12 @@ static void msr_write(AccessDataRecord * dRecord)
 
     dRecord->errorcode = ERR_NOERROR;
 
+    if (FD_MSR[cpu] == -2)
+    {
+        dRecord->errorcode = ERR_NODEV;
+        return;
+    }
+
     if (!allowed(reg))
     {
         syslog(LOG_ERR, "attempt to write to restricted register %x", reg);
@@ -282,10 +409,10 @@ static void pci_read(AccessDataRecord* dRecord)
     dRecord->data = 0;
 
     if (FD_PCI[socketId][device] == -2)
-	{
-		dRecord->errorcode = ERR_NODEV;
-		return;
-	}
+    {
+        dRecord->errorcode = ERR_NODEV;
+        return;
+    }
     else if ( !FD_PCI[socketId][device] )
     {
         strncpy(pci_filepath, PCI_ROOT_PATH, 30);
@@ -302,7 +429,7 @@ static void pci_read(AccessDataRecord* dRecord)
         }
     }
 
-    if ( pread(FD_PCI[socketId][device], &data, sizeof(data), reg) != sizeof(data)) 
+    if ( pread(FD_PCI[socketId][device], &data, sizeof(data), reg) != sizeof(data))
     {
         syslog(LOG_ERR, "Failed to read data from pci device file on socket %u device %u",
                 socketId, device);
@@ -323,11 +450,11 @@ static void pci_write(AccessDataRecord* dRecord)
     uint32_t data = (uint32_t) dRecord->data;
 
     dRecord->errorcode = ERR_NOERROR;
-	if (FD_PCI[socketId][device] == -2)
-	{
-		dRecord->errorcode = ERR_NODEV;
-		return;
-	}
+    if (FD_PCI[socketId][device] == -2)
+    {
+        dRecord->errorcode = ERR_NODEV;
+        return;
+    }
     else if ( !FD_PCI[socketId][device] )
     {
         strncpy(pci_filepath, PCI_ROOT_PATH, 30);
@@ -344,7 +471,7 @@ static void pci_write(AccessDataRecord* dRecord)
         }
     }
 
-    if (pwrite(FD_PCI[socketId][device], &data, sizeof data, reg) != sizeof data) 
+    if (pwrite(FD_PCI[socketId][device], &data, sizeof data, reg) != sizeof data)
     {
         syslog(LOG_ERR, "Failed to write data to pci device file on socket %u", socketId);
         dRecord->errorcode = ERR_RWFAIL;
@@ -432,7 +559,7 @@ static void daemonize(int* parentPid)
 
     /* Change the current working directory.  This prevents the current
        directory from being locked; hence not being able to remove it. */
-    if ((chdir("/")) < 0) 
+    if ((chdir("/")) < 0)
     {
         syslog(LOG_ERR, "chdir failed:  %s", strerror(errno));
         exit(EXIT_FAILURE);
@@ -458,6 +585,7 @@ int main(void)
     mode_t oldumask;
     uint32_t numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
     uint32_t model;
+    int isIntel = 1;
 
     if (!lock_check())
     {
@@ -465,61 +593,77 @@ int main(void)
         exit(EXIT_FAILURE);
     }
 
+    for ( uint32_t i=0; i < numHWThreads; i++ )
     {
-        uint32_t  eax = 0x00;
-        uint32_t  ebx = 0x00;
-        int isIntel = 1;
-        CPUID;
-        if (ebx == 0x68747541U)
-        {
-            isIntel = 0;
-        }
+        FD_MSR[i] = -1;
+    }
 
-        eax = 0x01;
-        CPUID;
-        uint32_t family = ((eax >> 8) & 0xFU) + ((eax >> 20) & 0xFFU);
-        model  = (((eax >> 16) & 0xFU) << 4) + ((eax >> 4) & 0xFU);
+    uint32_t  eax = 0x00;
+    uint32_t  ebx = 0x00;
+    
+    CPUID;
+    if (ebx == 0x68747541U)
+    {
+        isIntel = 0;
+    }
 
-        switch (family)
-        {
-            case P6_FAMILY:
-                allowed = allowed_intel;
+    eax = 0x01;
+    CPUID;
+    uint32_t family = ((eax >> 8) & 0xFU) + ((eax >> 20) & 0xFFU);
+    model  = (((eax >> 16) & 0xFU) << 4) + ((eax >> 4) & 0xFU);
 
-                if ((model == SANDYBRIDGE)        ||
-                        (model == SANDYBRIDGE_EP) ||
-                        (model == IVYBRIDGE)      ||
-                        (model == IVYBRIDGE_EP) )
-                {
-                    allowed = allowed_sandybridge;
-                    isPCIUncore = 1;
-                }
-                else if (model == HASWELL)
-                {
-                    allowed = allowed_sandybridge;
-                }
-                break;
-            case K8_FAMILY:
-                if (isIntel)
-                {
-                    fprintf(stderr,
-                            "ERROR - [%s:%d] - Netburst architecture is not supported! Exiting! \n",
-                            __FILE__,__LINE__);
-                    exit(EXIT_FAILURE);
-                }
-            case K10_FAMILY:
+    switch (family)
+    {
+        case P6_FAMILY:
+            allowed = allowed_intel;
+
+            if (isIntel && ((model == SANDYBRIDGE)    ||
+                            (model == SANDYBRIDGE_EP) ||
+                            (model == IVYBRIDGE)      ||
+                            (model == IVYBRIDGE_EP) ))
+            {
+                allowed = allowed_sandybridge;
+                isPCIUncore = 1;
+            }
+            else if (isIntel && ((model == HASWELL)    ||
+                                 (model == HASWELL_M1) ||
+                                 (model == HASWELL_M2) ||
+                                 (model == HASWELL_EX)))
+            {
+                allowed = allowed_haswell;
+            }
+            else if (isIntel && (model == ATOM_SILVERMONT))
+            {
+                allowed = allowed_silvermont;
+            }
+            else if (isIntel && (model == WESTMERE_EX))
+            {
+                allowed = allowed_westmereEX;
+            }
+            break;
+        case K8_FAMILY:
+        case K10_FAMILY:
+            if (!isIntel) 
+            {
                 allowed = allowed_amd;
-                break;
-            case K15_FAMILY:
+            }
+            break;
+        case K15_FAMILY:
+            if (!isIntel) 
+            {
                 allowed = allowed_amd15;
-                break;
-            case K16_FAMILY:
+            }
+            break;
+        case K16_FAMILY:
+            if (!isIntel) 
+            {
                 allowed = allowed_amd16;
-	        break;
-            default:
-                fprintf(stderr, "ERROR - [%s:%d] - Unsupported processor. Exiting!  \n",
-                        __FILE__, __LINE__);
-                exit(EXIT_FAILURE);
-        }
+            }
+            break;
+        default:
+            fprintf(stderr, "ERROR - [%s:%d] - Unsupported processor. Exiting!\n",
+                    __FILE__, __LINE__);
+            exit(EXIT_FAILURE);
     }
 
     openlog(ident, 0, LOG_USER);
@@ -593,12 +737,21 @@ int main(void)
          * NOTICE: This assumes consecutive processor Ids! */
         for ( uint32_t i=0; i < numHWThreads; i++ )
         {
+#ifdef __MIC
+            sprintf(msr_file_name,"/dev/msr%d",i);
+            if (access(msr_file_name, F_OK))
+            {
+                sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
+            }
+#else
             sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
+#endif
             FD_MSR[i] = open(msr_file_name, O_RDWR);
 
             if ( FD_MSR[i] < 0 )
             {
                 syslog(LOG_ERR, "Failed to open device file %s.",msr_file_name);
+                FD_MSR[i] = -2;
             }
         }
 
@@ -608,7 +761,7 @@ int main(void)
         {
             for (int j=0; j<MAX_NUM_NODES; j++)
             {
-            	socket_bus[j] = "N-A";
+                socket_bus[j] = "N-A";
                 for (int i=0; i<MAX_NUM_DEVICES; i++)
                 {
                     FD_PCI[j][i] = -2;
@@ -661,25 +814,25 @@ int main(void)
             }
             else
             {
-		        socket_count = cntr;
-            
-		        for (int j=0; j<socket_count; j++)
-				{
-					for (int i=0; i<MAX_NUM_DEVICES; i++)
-					{
-						sprintf(pci_filepath, "%s%s%s",PCI_ROOT_PATH,socket_bus[j],pci_DevicePath[i]);
-
-						if (!access(pci_filepath,F_OK))
-						{
-							FD_PCI[j][i] = 0;
-						}
-						else
-						{
-							syslog(LOG_NOTICE, "Device %s not found, excluded it from device list\n",pci_filepath);
-						}
-					}
-				}
-			}
+                socket_count = cntr;
+
+                for (int j=0; j<socket_count; j++)
+                {
+                    for (int i=0; i<MAX_NUM_DEVICES; i++)
+                    {
+                        sprintf(pci_filepath, "%s%s%s",PCI_ROOT_PATH,socket_bus[j],pci_DevicePath[i]);
+
+                        if (!access(pci_filepath,F_OK))
+                        {
+                            FD_PCI[j][i] = 0;
+                        }
+                        else
+                        {
+                            syslog(LOG_NOTICE, "Device %s not found, excluded it from device list\n",pci_filepath);
+                        }
+                    }
+                }
+            }
         }
     }
 
diff --git a/src/access-daemon/setFreq.c b/src/access-daemon/setFreq.c
index e23335c..967dbbf 100644
--- a/src/access-daemon/setFreq.c
+++ b/src/access-daemon/setFreq.c
@@ -1,36 +1,101 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  setFreq.c
+ *
+ *      Description:  Wrapper for accessing setfreq kernel FS files
+ *
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
+ *
+ *      Authors:  Michael Meier, michael.meier at rrze.fau.de
+ *                Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2014 Jan Treibig
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
 #include <stdlib.h>
 #include <stdio.h>
-
 #include <string.h>
 
+/* Count the "processor" entries in /proc/cpuinfo; returns 0 if it cannot be opened. */
+static int get_numCPUs()
+{
+    int cpucount = 0;
+    char line[1024];
+    FILE* fp = fopen("/proc/cpuinfo","r");
+    if (fp == NULL) return 0; /* nothing to count, and nothing to close */
+    while( fgets(line,sizeof(line),fp) )
+    {
+        if (strncmp(line, "processor", 9) == 0)
+        {
+            cpucount++;
+        }
+    }
+    fclose(fp); /* release the stream so repeated calls do not leak descriptors */
+    return cpucount;
+}
+
 int main (int argn, char** argv)
 {
     int cpuid;
     int freq;
+    int numCPUs = 0;
     char* gov;
     char* gpath = malloc(100);
     char* fpath = malloc(100);
+    FILE* f;
 
-    if (argn < 3)
+    if (argn < 3 || argn > 4)
     {
         fprintf(stderr, "Usage: %s <processorID> <frequency> [<governor>] \n",argv[0]);
+        exit(EXIT_FAILURE);
     }
 
     cpuid = atoi(argv[1]);
+    numCPUs = get_numCPUs();
+    if (cpuid < 0 || cpuid > numCPUs)
+    {
+        fprintf(stderr, "CPU %d not a valid CPU ID. Range from 0 to %d.\n",cpuid,numCPUs);
+        exit(EXIT_FAILURE);
+    }
     freq  = atoi(argv[2]);
+    if (freq < 0)
+    {
+        fprintf(stderr, "Frequency must be greater than 0.\n");
+        exit(EXIT_FAILURE);
+    }
+    snprintf(gpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
+    snprintf(fpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed", cpuid);
 
     if (argn == 4)
     {
         gov = argv[3];
 
-        if ((strncmp(gov,"ondemand",12)) && (strncmp(gov,"performance",12))) {
+        if ((strncmp(gov,"ondemand",12)) && (strncmp(gov,"performance",12)))
+        {
             fprintf(stderr, "Invalid governor %s!\n",gov);
             return (EXIT_FAILURE);
         }
-        snprintf(gpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
 
-        FILE* f = fopen(gpath, "w");
-        if (f == NULL) {
+        f = fopen(gpath, "w");
+        if (f == NULL)
+        {
             fprintf(stderr, "Unable to open path for writing\n");
             return (EXIT_FAILURE);
         }
@@ -38,20 +103,21 @@ int main (int argn, char** argv)
         fclose(f);
         return(EXIT_SUCCESS);
     }
-
-    snprintf(gpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
-    snprintf(fpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed", cpuid);
-
-    FILE* f = fopen(gpath, "w");
-    if (f == NULL) {
-        fprintf(stderr, "Unable to open path for writing\n");
-        return (EXIT_FAILURE);
+    else
+    {
+        f = fopen(gpath, "w");
+        if (f == NULL)
+        {
+            fprintf(stderr, "Unable to open path for writing\n");
+            return (EXIT_FAILURE);
+        }
+        fprintf(f,"userspace");
+        fclose(f);
     }
-    fprintf(f,"userspace");
-    fclose(f);
 
     f = fopen(fpath, "w");
-    if (f == NULL) {
+    if (f == NULL)
+    {
         fprintf(stderr, "Unable to open path for writing\n");
         return (EXIT_FAILURE);
     }
diff --git a/src/access-daemon/setFreq.c.tmp b/src/access-daemon/setFreq.c.tmp
deleted file mode 100644
index e69de29..0000000
diff --git a/src/accessClient.c b/src/accessClient.c
index 4c1cd20..ba4cb59 100644
--- a/src/accessClient.c
+++ b/src/accessClient.c
@@ -5,11 +5,11 @@
  *
  *      Description:  Implementation of client to the access daemon.
  *                   Provides API to read and write values to MSR or
- *                   PCI Cfg Adresses. This module is used by the 
+ *                   PCI Cfg Adresses. This module is used by the
  *                   msr and pci modules.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -72,7 +72,7 @@ static char* accessClient_strerror(AccessErrorType det)
         case ERR_DAEMONBUSY: return "daemon already has a same/higher priority client";
         case ERR_LOCKED:     return "access to HPM is locked";
         case ERR_UNSUPPORTED: return "unsupported processor";
-        case ERR_NODEV: 	 return "no such device";
+        case ERR_NODEV:      return "no such device";
         default:             return "UNKNOWN errorcode";
     }
 }
@@ -93,25 +93,25 @@ static int startDaemon(void)
 
     if (accessClient_mode == DAEMON_AM_ACCESS_D)
     {
-    	if (access(exeprog, F_OK))
-    	{
-    		fprintf(stderr, "Daemon '%s' cannot be found\n", exeprog);
-    		exit(EXIT_FAILURE);
-    	}
-    	if (access(exeprog, X_OK))
-    	{
-    		fprintf(stderr, "Daemon '%s' not executable\n", exeprog);
-    		exit(EXIT_FAILURE);
-    	}
+        if (access(exeprog, F_OK))
+        {
+            fprintf(stderr, "Daemon '%s' cannot be found\n", exeprog);
+            exit(EXIT_FAILURE);
+        }
+        if (access(exeprog, X_OK))
+        {
+            fprintf(stderr, "Daemon '%s' not executable\n", exeprog);
+            exit(EXIT_FAILURE);
+        }
         pid = fork();
 
         if (pid == 0)
-		{
-			ret = execve (exeprog, newargv, newenv);
-			ERRNO_PRINT;
-			fprintf(stderr, "Failed to execute the daemon '%s' (see error above)\n", exeprog);
-			exit(EXIT_FAILURE);
-		}
+        {
+            ret = execve (exeprog, newargv, newenv);
+            ERRNO_PRINT;
+            fprintf(stderr, "Failed to execute the daemon '%s' (see error above)\n", exeprog);
+            exit(EXIT_FAILURE);
+        }
         else if (pid < 0)
         {
             ERROR_PLAIN_PRINT(Failed to fork);
@@ -215,9 +215,9 @@ uint64_t accessClient_read(
     if (data.errorcode != ERR_NOERROR)
     {
         fprintf(stderr, "Failed to read data through daemon: "
-                "daemon returned error %d '%s' for cpu %d reg %x\n",
+                "daemon returned error %d '%s' for cpu %d reg 0x%x\n",
                 data.errorcode, accessClient_strerror(data.errorcode), cpu, reg);
-        exit(EXIT_FAILURE);
+        //exit(EXIT_FAILURE);
     }
 
     return data.data;
@@ -245,7 +245,7 @@ void accessClient_write(
         fprintf(stderr, "Failed to write data through daemon: "
                 "daemon returned error %d '%s' for cpu %d reg 0x%x\n",
                 data.errorcode, accessClient_strerror(data.errorcode), cpu, reg);
-        exit(EXIT_FAILURE);
+        //exit(EXIT_FAILURE);
     }
 
     if (data.data != 0x00ULL)
diff --git a/src/affinity.c b/src/affinity.c
index 3b5f508..59b05da 100644
--- a/src/affinity.c
+++ b/src/affinity.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Implementation of affinity module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -33,6 +33,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <math.h>
 #include <sys/types.h>
 #include <sys/syscall.h>
 #include <sys/time.h>
@@ -88,52 +89,50 @@ treeFillNextEntries(
     int offset,
     int numberOfEntries )
 {
-  int counter = numberOfEntries;
-  TreeNode* node = tree;
-  TreeNode* thread;
+    int counter = numberOfEntries;
+    TreeNode* node = tree;
+    TreeNode* thread;
+    node = tree_getChildNode(node);
 
-  node = tree_getChildNode(node);
-
-  /* get socket node */
-  for (int i=0; i<socketId; i++)
-  {
-    node = tree_getNextNode(node);
-
-    if ( node == NULL )
+    /* get socket node */
+    for (int i=0; i<socketId; i++)
     {
-      printf("ERROR: Socket %d not existing!",i);
-      exit(EXIT_FAILURE);
-    }
-  }
+        node = tree_getNextNode(node);
 
-  node = tree_getChildNode(node);
-  /* skip offset cores */
-  for (int i=0; i<offset; i++)
-  {
-    node = tree_getNextNode(node);
+        if ( node == NULL )
+        {
+          printf("ERROR: Socket %d not existing!",i);
+          exit(EXIT_FAILURE);
+        }
+    }
 
-    if ( node == NULL )
+    node = tree_getChildNode(node);
+    /* skip offset cores */
+    for (int i=0; i<offset; i++)
     {
-      printf("ERROR: Core %d not existing!",i);
-      exit(EXIT_FAILURE);
-    }
-  }
+        node = tree_getNextNode(node);
 
-  /* Traverse horizontal */
-  while ( node != NULL )
-  {
-    if ( !counter ) break;
+        if ( node == NULL )
+        {
+          printf("ERROR: Core %d on socket %d not existing!",i,socketId);
+          exit(EXIT_FAILURE);
+        }
+    }
+    /* Traverse horizontal */
+    while ( node != NULL )
+    {
+        if ( !counter ) break;
 
-    thread = tree_getChildNode(node);
+        thread = tree_getChildNode(node);
 
-    while ( thread != NULL )
-    {
-      processorIds[numberOfEntries-counter] = thread->id;
-      thread = tree_getNextNode(thread);
-      counter--;
+        while ( thread != NULL )
+        {
+            processorIds[numberOfEntries-counter] = thread->id;
+            thread = tree_getNextNode(thread);
+            counter--;
+        }
+        node = tree_getNextNode(node);
     }
-    node = tree_getNextNode(node);
-  }
 }
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
@@ -166,9 +165,20 @@ affinity_init()
         (cpuid_topology.numCoresPerSocket/numberOfCoresPerCache);
 
     /* determine total number of domains */
-    numberOfDomains += numberOfSocketDomains + numberOfCacheDomains + numberOfNumaDomains;
-
+    if ( numberOfNumaDomains > 1 )
+    {
+        numberOfDomains += numberOfSocketDomains + numberOfCacheDomains + numberOfNumaDomains;
+    }
+    else
+    {
+        numberOfDomains += numberOfSocketDomains + numberOfCacheDomains;
+    }
     domains = (AffinityDomain*) malloc(numberOfDomains * sizeof(AffinityDomain));
+    if (!domains)
+    {
+        fprintf(stderr, "Cannot allocate affinity domain memory\n");
+        return;
+    }
 
     /* Node domain */
     domains[0].numberOfProcessors = cpuid_topology.numHWThreads;
@@ -228,37 +238,40 @@ affinity_init()
       }
     }
 
-    /* Memory domains */
-    currentDomain += numberOfCacheDomains;
-    subCounter = 0;
-
-    for (int i=0; i < numberOfSocketDomains; i++ )
+    if ( numberOfNumaDomains > 1 )
     {
-      offset = 0;
-
-      for ( int j=0; j < (numberOfNumaDomains/numberOfSocketDomains); j++ )
-      {
-        domains[currentDomain + subCounter].numberOfProcessors = numberOfProcessorsPerCache;
-        domains[currentDomain + subCounter].numberOfCores =  numberOfCoresPerCache;
-        domains[currentDomain + subCounter].processorList = (int*) malloc(numa_info.nodes[subCounter].numberOfProcessors*sizeof(int));
-        domains[currentDomain + subCounter].tag = bformat("M%d", subCounter);
-
-        treeFillNextEntries(
-            cpuid_topology.topologyTree,
-            domains[currentDomain + subCounter].processorList,
-            i, offset, domains[currentDomain + subCounter].numberOfProcessors);
+        /* Memory domains */
+        currentDomain += numberOfCacheDomains;
+        subCounter = 0;
 
-        offset += numberOfCoresPerCache;
-        subCounter++;
-      }
-    }
+        for (int i=0; i < numberOfSocketDomains; i++ )
+        {
+            offset = 0;
+            for ( int j=0; j < (int)ceil((double)numberOfNumaDomains/numberOfSocketDomains); j++ )
+            {
+                domains[currentDomain + subCounter].numberOfProcessors = numa_info.nodes[subCounter].numberOfProcessors;
+                domains[currentDomain + subCounter].numberOfCores =  numberOfCoresPerCache;
+                domains[currentDomain + subCounter].processorList = (int*) malloc(numa_info.nodes[subCounter].numberOfProcessors*sizeof(int));
+                domains[currentDomain + subCounter].tag = bformat("M%d", subCounter);
+
+                treeFillNextEntries(
+                        cpuid_topology.topologyTree,
+                        domains[currentDomain + subCounter].processorList,
+                        i, offset, domains[currentDomain + subCounter].numberOfProcessors);
+
+                offset += domains[currentDomain + subCounter].numberOfCores;
+
+                subCounter++;
+            }
+        }
 
-    /* This is redundant ;-). Create thread to node lookup */
-    for ( uint32_t i = 0; i < numa_info.numberOfNodes; i++ )
-    {
-        for ( int j = 0; j < numa_info.nodes[i].numberOfProcessors; j++ )
+        /* This is redundant ;-). Create thread to node lookup */
+        for ( uint32_t i = 0; i < numa_info.numberOfNodes; i++ )
         {
-            affinity_core2node_lookup[numa_info.nodes[i].processors[j]] = i;
+            for ( int j = 0; j < numa_info.nodes[i].numberOfProcessors; j++ )
+            {
+                affinity_core2node_lookup[numa_info.nodes[i].processors[j]] = i;
+            }
         }
     }
 
@@ -308,7 +321,7 @@ affinity_threadGetProcessorId()
 void
 affinity_pinThread(int processorId)
 {
-	cpu_set_t cpuset;
+    cpu_set_t cpuset;
     pthread_t thread;
 
     thread = pthread_self();
@@ -327,11 +340,11 @@ affinity_pinThread(int processorId)
 void
 affinity_pinProcess(int processorId)
 {
-	cpu_set_t cpuset;
+    cpu_set_t cpuset;
 
-	CPU_ZERO(&cpuset);
-	CPU_SET(processorId, &cpuset);
-	sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
+    CPU_ZERO(&cpuset);
+    CPU_SET(processorId, &cpuset);
+    sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
 }
 
 
@@ -351,18 +364,22 @@ affinity_getDomain(bstring domain)
 }
 
 void
-affinity_printDomains()
+affinity_printDomains(FILE* OUTSTREAM)
 {
-    for ( int i=0; i < affinity_numberOfDomains; i++ )
+    if (OUTSTREAM)
     {
-        printf("Domain %d:\n",i);
-        printf("\tTag %s:",bdata(domains[i].tag));
-
-        for ( uint32_t j=0; j < domains[i].numberOfProcessors; j++ )
+        for ( int i=0; i < affinity_numberOfDomains; i++ )
         {
-            printf(" %d",domains[i].processorList[j]);
+            fprintf(OUTSTREAM, "Domain %d:\n", i);
+            fprintf(OUTSTREAM, "\tTag %s:", bdata(domains[i].tag));
+
+            for ( uint32_t j=0; j < domains[i].numberOfProcessors; j++ )
+            {
+                fprintf(OUTSTREAM, " %d", domains[i].processorList[j]);
+            }
+            fprintf(OUTSTREAM, "\n");
+            fflush(OUTSTREAM);
         }
-        printf("\n");
     }
 }
 
diff --git a/src/allocator.c b/src/allocator.c
index 811cc1c..83e8164 100644
--- a/src/allocator.c
+++ b/src/allocator.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Implementation of allocator module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -76,6 +76,7 @@ allocator_finalize()
 
 void
 allocator_allocateVector(
+        FILE* OUTSTREAM,
         void** ptr,
         int alignment,
         uint64_t size,
@@ -90,9 +91,11 @@ allocator_allocateVector(
     switch ( type )
     {
         case SINGLE:
+        case SINGLE_RAND:
             bytesize = (size+offset) * sizeof(float);
             break;
 
+        case DOUBLE_RAND:
         case DOUBLE:
             bytesize = (size+offset) * sizeof(double);
             break;
@@ -128,10 +131,13 @@ allocator_allocateVector(
     domain = affinity_getDomain(domainString);
     affinity_pinProcess(domain->processorList[0]);
 
-    printf("Allocate: Process running on core %d - Vector length %llu Offset %d\n",
+    if (OUTSTREAM)
+    {
+        fprintf(OUTSTREAM, "Allocate: Process running on core %d - Vector length %llu Offset %d\n",
             affinity_processGetProcessorId(),
             LLU_CAST size,
             offset);
+    }
 
     switch ( type )
     {
@@ -142,7 +148,7 @@ allocator_allocateVector(
 
                 for ( uint64_t i=0; i < size; i++ )
                 {
-                    sptr[i] = 0.0;
+                    sptr[i] = 1.0;
                 }
                 *ptr = (void*) sptr;
 
@@ -156,11 +162,38 @@ allocator_allocateVector(
 
                 for ( uint64_t i=0; i < size; i++ )
                 {
-                    dptr[i] = 0.0;
+                    dptr[i] = 1.0;
+                }
+                *ptr = (void*) dptr;
+            }
+            break;
+        case SINGLE_RAND:
+            {
+                srand((uint64_t)ptr);
+                float* sptr = (float*) (*ptr);
+                sptr += offset;
+
+                for ( uint64_t i=0; i < size; i++ )
+                {
+                    sptr[i] = rand()/((float)RAND_MAX)*2.0-1.0;
+                }
+                *ptr = (void*) sptr;
+            }
+            break;
+        case DOUBLE_RAND:
+            {
+                srand((uint64_t)ptr);
+                double* dptr = (double*) (*ptr);
+                dptr += offset;
+
+                for ( uint64_t i=0; i < size; i++ )
+                {
+                    dptr[i] = rand()/((double)RAND_MAX)*2.0-1.0;
                 }
                 *ptr = (void*) dptr;
             }
             break;
+        
     }
 }
 
diff --git a/src/applications/likwid-bench.c b/src/applications/likwid-bench.c
index 001874f..15f6f0d 100644
--- a/src/applications/likwid-bench.c
+++ b/src/applications/likwid-bench.c
@@ -5,8 +5,8 @@
  *
  *      Description:  A flexible and extensible benchmarking toolbox
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -49,31 +49,39 @@
 #include <allocator.h>
 
 #include <likwid.h>
+#ifdef PAPI
+#include <papi.h>
+#include <omp.h>
+#endif
 
 extern void* runTest(void* arg);
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
 #define HELP_MSG \
-    printf("Threaded Memory Hierarchy Benchmark --  Version  %d.%d \n\n",VERSION,RELEASE); \
-printf("\n"); \
-printf("Supported Options:\n"); \
-printf("-h\t Help message\n"); \
-printf("-a\t list available benchmarks \n"); \
-printf("-p\t list available thread domains\n"); \
-printf("-l <TEST>\t list properties of benchmark \n"); \
-printf("-i <INT>\t number of iterations \n"); \
-printf("-g <INT>\t number of workgroups (mandatory)\n"); \
-printf("-t <TEST>\t type of test \n"); \
-printf("-w\t <thread_domain>:<size>[:<num_threads>[:<chunk size>:<stride>][-<streamId>:<domain_id>[:<offset>]], size in kB, MB or GB  (mandatory)\n"); \
-printf("Processors are in compact ordering. Optionally every stream can be placed. Either no stream or all streams must be placed. Multiple streams are separated by commas.\n"); \
-printf("Usage: likwid-bench -t copy -i 1000 -g 1 -w S0:100kB:10:1:2 \n"); \
-printf("\tRun 10 threads on socket 0 using physical cores only (presuming SMT2 system).\n"); \
-printf("Example with data placement: likwid-bench -t copy -i 1000 -g 1 -w S0:100kB:20-0:S1,1:S1 \n"); \
-printf("\tRun 20 threads on socket 0 and place both arrays of the copy test case on socket 1.\n")
+    fprintf(stdout, "Threaded Memory Hierarchy Benchmark --  Version  %d.%d \n\n",VERSION,RELEASE); \
+    fprintf(stdout, "\n"); \
+    fprintf(stdout, "Supported Options:\n"); \
+    fprintf(stdout, "-h\t Help message\n"); \
+    fprintf(stdout, "-v\t Version information\n"); \
+    fprintf(stdout, "-q\t Silent without output\n"); \
+    fprintf(stdout, "-a\t list available benchmarks \n"); \
+    fprintf(stdout, "-p\t list available thread domains\n"); \
+    fprintf(stdout, "-l <TEST>\t list properties of benchmark \n"); \
+    fprintf(stdout, "-i <INT>\t number of iterations \n"); \
+    fprintf(stdout, "-g <INT>\t number of workgroups (mandatory)\n"); \
+    fprintf(stdout, "-t <TEST>\t type of test \n"); \
+    fprintf(stdout, "-w\t <thread_domain>:<size>[:<num_threads>[:<chunk size>:<stride>][-<streamId>:<domain_id>[:<offset>]], size in kB, MB or GB  (mandatory)\n"); \
+    fprintf(stdout, "Processors are in compact ordering. Optionally every stream can be placed. Either no stream or all streams must be placed. Multiple streams are separated by commas.\n"); \
+    fprintf(stdout, "Usage: likwid-bench -t copy -i 1000 -g 1 -w S0:100kB:10:1:2 \n"); \
+    fprintf(stdout, "\tRun 10 threads on socket 0 using physical cores only (presuming SMT2 system).\n"); \
+    fprintf(stdout, "Example with data placement: likwid-bench -t copy -i 1000 -g 1 -w S0:100kB:20-0:S1,1:S1 \n"); \
+    fprintf(stdout, "\tRun 20 threads on socket 0 and place both arrays of the copy test case on socket 1.\n"); \
+    fflush(stdout);
 
 #define VERSION_MSG \
-    printf("likwid-bench   %d.%d \n\n",VERSION,RELEASE)
+    fprintf(stdout, "likwid-bench   %d.%d \n\n",VERSION,RELEASE); \
+    fflush(stdout);
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE  ############ */
 
@@ -116,6 +124,7 @@ int main(int argc, char** argv)
     const TestCase* test = NULL;
     Workgroup* currentWorkgroup = NULL;
     Workgroup* groups = NULL;
+    FILE* OUTSTREAM = stdout;
 
     if (cpuid_init() == EXIT_FAILURE)
     {
@@ -132,45 +141,52 @@ int main(int argc, char** argv)
         exit(EXIT_SUCCESS);
     }
     opterr = 0;
-    while ((c = getopt (argc, argv, "g:w:t:i:l:aphv")) != -1) {
+    while ((c = getopt (argc, argv, "g:w:t:i:l:aphvq")) != -1) {
         switch (c)
         {
             case 'h':
                 HELP_MSG;
                 affinity_finalize();
-				if (groups)
-				{
-					free(groups);
-				}
+                if (groups)
+                {
+                    free(groups);
+                }
                 exit (EXIT_SUCCESS);
             case 'v':
                 VERSION_MSG;
                 affinity_finalize();
                 if (groups)
                 {
-                	free(groups);
+                    free(groups);
                 }
                 exit (EXIT_SUCCESS);
             case 'a':
-                printf(TESTS"\n");
+                if (OUTSTREAM)
+                {
+                    fprintf(OUTSTREAM, TESTS"\n");
+                    fflush(OUTSTREAM);
+                }
                 affinity_finalize();
                 if (groups)
                 {
-                	free(groups);
+                    free(groups);
                 }
                 exit (EXIT_SUCCESS);
+            case 'q':
+                OUTSTREAM = NULL;
+                break;
             case 'w':
                 tmp--;
 
                 if (tmp == -1)
                 {
                     fprintf (stderr, "More workgroups configured than allocated!\n"
-                    	"Did you forget to set the number of workgroups with -g?\n");
+                        "Did you forget to set the number of workgroups with -g?\n");
                     affinity_finalize();
                     if (groups)
-		            {
-		            	free(groups);
-		            }
+                    {
+                        free(groups);
+                    }
                     return EXIT_FAILURE;
                 }
                 if (!test)
@@ -178,9 +194,9 @@ int main(int argc, char** argv)
                     fprintf (stderr, "You need to specify a test case first!\n");
                     affinity_finalize();
                     if (groups)
-		            {
-		            	free(groups);
-		            }
+                    {
+                        free(groups);
+                    }
                     return EXIT_FAILURE;
                 }
                 testcase = bfromcstr(optarg);
@@ -195,13 +211,14 @@ int main(int argc, char** argv)
                         fprintf (stderr, "Stream %d: offset is not a multiple of stride!\n",i);
                         affinity_finalize();
                         if (groups)
-				        {
-				        	free(groups);
-				        }
+                        {
+                            free(groups);
+                        }
                         return EXIT_FAILURE;
                     }
 
-                    allocator_allocateVector(&(currentWorkgroup->streams[i].ptr),
+                    allocator_allocateVector(OUTSTREAM,
+                            &(currentWorkgroup->streams[i].ptr),
                             PAGE_ALIGNMENT,
                             currentWorkgroup->size,
                             currentWorkgroup->streams[i].offset,
@@ -212,6 +229,11 @@ int main(int argc, char** argv)
                 break;
             case 'i':
                 iter =  atoi(optarg);
+                if (iter <= 0)
+                {
+                    fprintf(stderr, "Iterations must be greater than 0.\n");
+                    exit(EXIT_FAILURE);
+                }
                 break;
             case 'l':
                 testcase = bfromcstr(optarg);
@@ -227,38 +249,46 @@ int main(int argc, char** argv)
                 if (biseqcstr(testcase,"none") || !test)
                 {
                     fprintf (stderr, "Unknown test case %s\n",optarg);
-                    printf("Available test cases:\n");
-                    printf(TESTS"\n");
+                    if (OUTSTREAM)
+                    {
+                        fprintf(OUTSTREAM, "Available test cases:\n");
+                        fprintf(OUTSTREAM, TESTS"\n");
+                        fflush(OUTSTREAM);
+                    }
                     affinity_finalize();
                     if (groups)
-		            {
-		            	free(groups);
-		            }
+                    {
+                        free(groups);
+                    }
                     return EXIT_FAILURE;
                 }
                 else
                 {
-                    printf("Name: %s\n",test->name);
-                    printf("Number of streams: %d\n",test->streams);
-                    printf("Loop stride: %d\n",test->stride);
-                    printf("Flops: %d\n",test->flops);
-                    printf("Bytes: %d\n",test->bytes);
-                    switch (test->type)
+                    if (OUTSTREAM)
                     {
-                        case SINGLE:
-                            printf("Data Type: Single precision float\n");
-                            break;
-                        case DOUBLE:
-                            printf("Data Type: Double precision float\n");
-                            break;
+                        fprintf(OUTSTREAM, "Name: %s\n",test->name);
+                        fprintf(OUTSTREAM, "Number of streams: %d\n",test->streams);
+                        fprintf(OUTSTREAM, "Loop stride: %d\n",test->stride);
+                        fprintf(OUTSTREAM, "Flops: %d\n", (int) test->flops);
+                        fprintf(OUTSTREAM, "Bytes: %d\n",test->bytes);
+                        switch (test->type)
+                        {
+                            case SINGLE:
+                                fprintf(OUTSTREAM, "Data Type: Single precision float\n");
+                                break;
+                            case DOUBLE:
+                                fprintf(OUTSTREAM, "Data Type: Double precision float\n");
+                                break;
+                        }
+                        fflush(OUTSTREAM);
                     }
                 }
                 bdestroy(testcase);
                 affinity_finalize();
                 if (groups)
-		        {
-		        	free(groups);
-		        }
+                {
+                    free(groups);
+                }
                 exit (EXIT_SUCCESS);
 
                 break;
@@ -267,6 +297,11 @@ int main(int argc, char** argv)
                 break;
             case 'g':
                 numberOfWorkgroups =  atoi(optarg);
+                if (numberOfWorkgroups <= 0)
+                {
+                    fprintf(stderr, "Number of Workgroups must be 1 or greater.\n");
+                    exit(EXIT_FAILURE);
+                }
                 allocator_init(numberOfWorkgroups * MAX_STREAMS);
                 tmp = numberOfWorkgroups;
                 groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup));
@@ -287,9 +322,9 @@ int main(int argc, char** argv)
                     fprintf (stderr, "Unknown test case %s\n",optarg);
                     affinity_finalize();
                     if (groups)
-		            {
-		            	free(groups);
-		            }
+                    {
+                        free(groups);
+                    }
                     return EXIT_FAILURE;
                 }
                 bdestroy(testcase);
@@ -306,57 +341,68 @@ int main(int argc, char** argv)
                             optopt);
                 affinity_finalize();
                 if (groups)
-		        {
-		        	free(groups);
-		        }
+                {
+                    free(groups);
+                }
                 return EXIT_FAILURE;
             default:
                 HELP_MSG;
         }
     }
 
-	if (tmp > 0)
-	{
-		fprintf(stderr, "%d workgroups requested but only %d given on commandline\n",numberOfWorkgroups,numberOfWorkgroups-tmp);
-		affinity_finalize();
-		allocator_finalize();
-		if (groups)
+    if (numberOfWorkgroups == 0 && !optPrintDomains)
+    {
+        fprintf(stderr, "Number of Workgroups must be 1 or greater.\n");
+        affinity_finalize();
+        allocator_finalize();
+        if (groups)
         {
-        	free(groups);
+            free(groups);
         }
-		exit(EXIT_FAILURE);
-	}
-	if (iter <= 0)
-	{
-		fprintf(stderr,"Iterations must be greater than 0\n");
-		affinity_finalize();
-		allocator_finalize();
-		if (groups)
+        exit(EXIT_FAILURE);
+    }
+    if (tmp > 0 && iter > 0)
+    {
+        fprintf(stderr, "%d workgroups requested but only %d given on commandline\n",numberOfWorkgroups,numberOfWorkgroups-tmp);
+        affinity_finalize();
+        allocator_finalize();
+        if (groups)
         {
-        	free(groups);
+            free(groups);
         }
-		exit(EXIT_FAILURE);
-	}
-	if (test && !(currentWorkgroup || groups))
-	{
-		fprintf(stderr, "Workgroups must be set on commandline\n");
-		affinity_finalize();
-		allocator_finalize();
-		if (groups)
+        exit(EXIT_FAILURE);
+    }
+    if (iter <= 0)
+    {
+        fprintf(stderr,"Iterations must be greater than 0\n");
+        affinity_finalize();
+        allocator_finalize();
+        if (groups)
+        {
+            free(groups);
+        }
+        exit(EXIT_FAILURE);
+    }
+    if (test && !(currentWorkgroup || groups))
+    {
+        fprintf(stderr, "Workgroups must be set on commandline\n");
+        affinity_finalize();
+        allocator_finalize();
+        if (groups)
         {
-        	free(groups);
+            free(groups);
         }
-		exit(EXIT_FAILURE);
-	}
+        exit(EXIT_FAILURE);
+    }
 
     if (optPrintDomains)
     {
-        affinity_printDomains();
+        affinity_printDomains(OUTSTREAM);
         affinity_finalize();
-		allocator_finalize();
-		if (groups)
+        allocator_finalize();
+        if (groups)
         {
-        	free(groups);
+            free(groups);
         }
         exit (EXIT_SUCCESS);
     }
@@ -369,7 +415,7 @@ int main(int argc, char** argv)
         globalNumberOfThreads += groups[i].numberOfThreads;
     }
 
-    threads_init(globalNumberOfThreads);
+    threads_init(OUTSTREAM, globalNumberOfThreads);
     threads_createGroups(numberOfWorkgroups);
 
     /* we configure global barriers only */
@@ -377,9 +423,21 @@ int main(int argc, char** argv)
     barrier_registerGroup(globalNumberOfThreads);
 
 #ifdef PERFMON
-    printf("Using likwid\n");
+    if (OUTSTREAM)
+    {
+        fprintf(OUTSTREAM, "Using likwid\n");
+        fflush(OUTSTREAM);
+    }
     likwid_markerInit();
 #endif
+#ifdef PAPI
+    if (OUTSTREAM)
+    {
+        fprintf(OUTSTREAM, "Using PAPI\n");
+    }
+    PAPI_library_init (PAPI_VER_CURRENT);
+    PAPI_thread_init((unsigned long (*)(void))(omp_get_thread_num));
+#endif
 
 
     /* initialize data structures for threads */
@@ -407,61 +465,68 @@ int main(int argc, char** argv)
         free(myData.streams);
     }
 
-    printf(HLINE);
-    printf("LIKWID MICRO BENCHMARK\n");
-    printf("Test: %s\n",test->name);
-    printf(HLINE);
-    printf("Using %d work groups\n",numberOfWorkgroups);
-    printf("Using %d threads\n",globalNumberOfThreads);
-    printf(HLINE);
+    if (OUTSTREAM)
+    {
+        fprintf(OUTSTREAM, HLINE);
+        fprintf(OUTSTREAM, "LIKWID MICRO BENCHMARK\n");
+        fprintf(OUTSTREAM, "Test: %s\n",test->name);
+        fprintf(OUTSTREAM, HLINE);
+        fprintf(OUTSTREAM, "Using %d work groups\n",numberOfWorkgroups);
+        fprintf(OUTSTREAM, "Using %d threads\n",globalNumberOfThreads);
+        fprintf(OUTSTREAM, HLINE);
+        fflush(OUTSTREAM);
+    }
 
     threads_create(runTest);
     threads_join();
     allocator_finalize();
-	
-	uint32_t realSize = 0;
-	uint64_t realCycles = 0;
-	int current_id = 0;
-    
-	printf(HLINE);
-    for(j=0;j<numberOfWorkgroups;j++)
+
+    uint32_t realSize = 0;
+    uint64_t realCycles = 0;
+    int current_id = 0;
+
+    if (OUTSTREAM)
     {
-    	current_id = j*groups[j].numberOfThreads;
-    	realCycles += threads_data[current_id].cycles;
-    	realSize += groups[j].numberOfThreads * threads_data[current_id].data.size;
-	}
-	time = (double) realCycles / (double) timer_getCpuClock();
-	printf("Cycles: %llu \n", LLU_CAST realCycles);
-	printf("Iterations: %llu \n", LLU_CAST iter);
-	printf("Size %d \n",  realSize );
-	printf("Vectorlength: %llu \n", LLU_CAST threads_data[current_id].data.size);
-	printf("Time: %e sec\n", time);
-	printf("Number of Flops: %llu \n", LLU_CAST (iter * realSize *  test->flops));
-	printf("MFlops/s: %.2f\n",
-	        1.0E-06 * ((double) iter * realSize *  test->flops/  time));
-	printf("MByte/s: %.2f\n",
-	        1.0E-06 * ( (double) iter * realSize *  test->bytes/ time));
-	printf("Cycles per update: %f\n",
-	        ((double) realCycles / (double) (iter * numberOfWorkgroups * threads_data[current_id].numberOfThreads *  threads_data[current_id].data.size)));
-
-	switch ( test->type )
-	{
-	    case SINGLE:
-	        printf("Cycles per cacheline: %f\n",
-	                (16.0 * (double) realCycles / (double) (iter * realSize)));
-	        break;
-	    case DOUBLE:
-	        printf("Cycles per cacheline: %f\n",
-	                (8.0 * (double) realCycles / (double) (iter * realSize)));
-	        break;
-	}
-
-	
-    printf(HLINE);
+        fprintf(OUTSTREAM, HLINE);
+        for(j=0;j<numberOfWorkgroups;j++)
+        {
+            current_id = j*groups[j].numberOfThreads;
+            realCycles += threads_data[current_id].cycles;
+            realSize += groups[j].numberOfThreads * threads_data[current_id].data.size;
+        }
+        time = (double) realCycles / (double) timer_getCpuClock();
+        fprintf(OUTSTREAM, "Cycles: %llu \n", LLU_CAST realCycles);
+        fprintf(OUTSTREAM, "Iterations: %llu \n", LLU_CAST iter);
+        fprintf(OUTSTREAM, "Size %d \n",  realSize );
+        fprintf(OUTSTREAM, "Vectorlength: %llu \n", LLU_CAST threads_data[current_id].data.size);
+        fprintf(OUTSTREAM, "Time: %e sec\n", time);
+        fprintf(OUTSTREAM, "Number of Flops: %llu \n", LLU_CAST (iter * realSize *  test->flops));
+        fprintf(OUTSTREAM, "MFlops/s: %.2f\n",
+                1.0E-06 * ((double) iter * realSize *  test->flops/  time));
+        fprintf(OUTSTREAM, "MByte/s: %.2f\n",
+                1.0E-06 * ( (double) iter * realSize *  test->bytes/ time));
+        fprintf(OUTSTREAM, "Cycles per update: %f\n",
+                ((double) realCycles / (double) (iter * numberOfWorkgroups * threads_data[current_id].numberOfThreads *  threads_data[current_id].data.size)));
+
+        switch ( test->type )
+        {
+            case SINGLE:
+                fprintf(OUTSTREAM, "Cycles per cacheline: %f\n",
+                        (16.0 * (double) realCycles / (double) (iter * realSize)));
+                break;
+            case DOUBLE:
+                fprintf(OUTSTREAM, "Cycles per cacheline: %f\n",
+                        (8.0 * (double) realCycles / (double) (iter * realSize)));
+                break;
+        }
+
+        fprintf(OUTSTREAM, HLINE);
+        fflush(OUTSTREAM);
+    }
     threads_destroy(numberOfWorkgroups);
     barrier_destroy();
     
-	affinity_finalize();
+    affinity_finalize();
 #ifdef PERFMON
     likwid_markerClose();
 #endif
diff --git a/src/applications/likwid-features.c b/src/applications/likwid-features.c
index 679561e..6fe5477 100644
--- a/src/applications/likwid-features.c
+++ b/src/applications/likwid-features.c
@@ -6,8 +6,8 @@
  *      Description:  An application to read out and set the feature flag
  *                  register on Intel Core 2 processors.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -45,18 +45,20 @@
 #include <cpuFeatures.h>
 
 #define HELP_MSG \
-printf("\nlikwid-features --  Version  %d.%d \n\n",VERSION,RELEASE); \
-printf("A tool to print and toggle the feature flag msr on Intel CPUS.\n"); \
-printf("Supported Features: HW_PREFETCHER, CL_PREFETCHER, DCU_PREFETCHER, IP_PREFETCHER.\n\n"); \
-printf("Options:\n"); \
-printf("-h\t Help message\n"); \
-printf("-v\t Version information\n"); \
-printf("-s <FEATURE>\t set cpu feature \n"); \
-printf("-u <FEATURE>\t unset cpu feature \n"); \
-printf("-c <ID>\t core id\n\n")
+    fprintf(stdout, "\nlikwid-features --  Version  %d.%d \n\n",VERSION,RELEASE); \
+    fprintf(stdout, "A tool to print and toggle the feature flag msr on Intel CPUS.\n"); \
+    fprintf(stdout, "Supported Features: HW_PREFETCHER, CL_PREFETCHER, DCU_PREFETCHER, IP_PREFETCHER.\n\n"); \
+    fprintf(stdout, "Options:\n"); \
+    fprintf(stdout, "-h\t Help message\n"); \
+    fprintf(stdout, "-v\t Version information\n"); \
+    fprintf(stdout, "-s <FEATURE>\t set cpu feature \n"); \
+    fprintf(stdout, "-u <FEATURE>\t unset cpu feature \n"); \
+    fprintf(stdout, "-c <ID>\t core id\n\n"); \
+    fflush(stdout);
 
 #define VERSION_MSG \
-printf("likwid-features  %d.%d \n\n",VERSION,RELEASE)
+    fprintf(stdout, "likwid-features  %d.%d \n\n",VERSION,RELEASE); \
+    fflush(stdout);
 
 int main (int argc, char** argv)
 { 
@@ -80,7 +82,7 @@ int main (int argc, char** argv)
             case 'u':
                 optSetFeature = 2;
             case 's':
-                if (! (argString = bSecureInput(20,optarg)))
+                if (! (argString = bSecureInput(40,optarg)))
                 {
                     fprintf(stderr,"Failed to read argument string!\n");
                     exit(EXIT_FAILURE);
@@ -115,7 +117,7 @@ int main (int argc, char** argv)
                 }
                 break;
             case 'c':
-                if (! (argString = bSecureInput(10,optarg)))
+                if (! (argString = bSecureInput(20,optarg)))
                 {
                     fprintf(stderr,"Failed to read argument string!\n");
                     exit(EXIT_FAILURE);
@@ -147,9 +149,10 @@ int main (int argc, char** argv)
         ERROR_PLAIN_PRINT(Unsupported processor!);
     }
 
-    printf(HLINE);
-    printf("CPU name:\t%s \n",cpuid_info.name);
-    printf("CPU core id:\t%d \n", cpuId);
+    fprintf(stdout, HLINE);
+    fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
+    fprintf(stdout, "CPU core id:\t%d \n", cpuId);
+    fflush(stdout);
 
     if (cpuid_info.family != P6_FAMILY)
     {
@@ -165,21 +168,22 @@ int main (int argc, char** argv)
 
     accessClient_init(&socket_fd);
     msr_init(socket_fd);
-	cpuFeatures_init(cpuId);
+    cpuFeatures_init(cpuId);
     cpuFeatures_print(cpuId);
 
     if (optSetFeature == 1)
     {
-        printf(SLINE);
+        fprintf(stdout, SLINE);
         cpuFeatures_enable(cpuId, feature);
-        printf(SLINE);
+        fprintf(stdout, SLINE);
     }
     else if (optSetFeature == 2)
     {
-        printf(SLINE);
+        fprintf(stdout, SLINE);
         cpuFeatures_disable(cpuId, feature);
-        printf(SLINE);
+        fprintf(stdout, SLINE);
     }
+    fflush(stdout);
 
     msr_finalize();
     return EXIT_SUCCESS;
diff --git a/src/applications/likwid-genCfg.c b/src/applications/likwid-genCfg.c
index c8d3216..97147fd 100644
--- a/src/applications/likwid-genCfg.c
+++ b/src/applications/likwid-genCfg.c
@@ -6,8 +6,8 @@
  *      Description:  An application to dump the cpu topology information to
  *      a config file.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -43,21 +43,24 @@
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
 #define HELP_MSG \
-printf("\nlikwid-genCfg --  Version  %d.%d \n\n",VERSION,RELEASE); \
-printf("A tool to dump node topology information into a file.\n"); \
-printf("Options:\n"); \
-printf("-h\t Help message\n"); \
-printf("-v\t Version information\n"); \
-printf("-o\t output file path (optional)\n\n");
+    fprintf(stdout, "\nlikwid-genCfg --  Version  %d.%d \n\n",VERSION,RELEASE); \
+    fprintf(stdout, "A tool to dump node topology information into a file.\n"); \
+    fprintf(stdout, "Options:\n"); \
+    fprintf(stdout, "-h\t Help message\n"); \
+    fprintf(stdout, "-v\t Version information\n"); \
+    fprintf(stdout, "-o\t output file path (optional)\n\n"); \
+    fflush(stdout);
 
 #define VERSION_MSG \
-printf("likwid-powermeter  %d.%d \n\n",VERSION,RELEASE)
+    fprintf(stdout, "likwid-genCfg  %d.%d \n\n",VERSION,RELEASE); \
+    fflush(stdout);
 
 
 int main (int argc, char** argv)
 {
     FILE *file;
     char *filepath = TOSTRING(CFGFILE);
+    size_t size;
     int c;
 
     while ((c = getopt (argc, argv, "ho:v")) != -1)
@@ -92,17 +95,18 @@ int main (int argc, char** argv)
     }
 
     cpuid_init();
-    printf(HLINE);
-    printf("CPU name:\t%s \n",cpuid_info.name);
+    fprintf(stdout, HLINE);
+    fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
+    fflush(stdout);
 
     if ((file = fopen(filepath, "wb")) != NULL) 
     {
-        (void) fwrite((void*) &cpuid_topology, sizeof(CpuTopology), 1, file);
+        size = fwrite((void*) &cpuid_topology, sizeof(CpuTopology), 1, file);
 
-        (void) fwrite((void*) cpuid_topology.threadPool,
+        size = fwrite((void*) cpuid_topology.threadPool,
                 sizeof(HWThread), cpuid_topology.numHWThreads, file);
 
-        (void) fwrite((void*) cpuid_topology.cacheLevels,
+        size = fwrite((void*) cpuid_topology.cacheLevels,
                 sizeof(CacheLevel), cpuid_topology.numCacheLevels, file);
 
         fclose(file);
diff --git a/src/applications/likwid-memsweeper.c b/src/applications/likwid-memsweeper.c
index 925aa79..4806763 100644
--- a/src/applications/likwid-memsweeper.c
+++ b/src/applications/likwid-memsweeper.c
@@ -5,8 +5,8 @@
  *
  *      Description:  An application to clean up NUMA memory domains.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -46,26 +46,32 @@
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
 #define HELP_MSG \
-printf("\nlikwid-memsweeper --  Version  %d.%d \n\n",VERSION,RELEASE); \
-printf("A tool clean up NUMA memory domains and last level caches.\n"); \
-printf("Options:\n"); \
-printf("-h\t Help message\n"); \
-printf("-v\t Version information\n"); \
-printf("-c\t specify NUMA domain ID to clean up\n"); \
-printf("Usage: likwid-memsweeper \n"); \
-printf("To clean specific domain: likwid-memsweeper -c 2 \n");
+    fprintf(stdout, "\nlikwid-memsweeper --  Version  %d.%d \n\n",VERSION,RELEASE); \
+    fprintf(stdout, "A tool clean up NUMA memory domains and last level caches.\n"); \
+    fprintf(stdout, "Options:\n"); \
+    fprintf(stdout, "-h\t Help message\n"); \
+    fprintf(stdout, "-v\t Version information\n"); \
+    fprintf(stdout, "-q\t Silent without output\n"); \
+    fprintf(stdout, "-c\t Specify NUMA domain ID to clean up\n"); \
+    fprintf(stdout, "\t If no specific domain is set, all domains are swept.\n"); \
+    fprintf(stdout, "Usage:\n"); \
+    fprintf(stdout, "To clean specific domain: likwid-memsweeper -c 2 \n"); \
+    fflush(stdout);
 
 #define VERSION_MSG \
-printf("likwid-memsweeper  %d.%d \n\n",VERSION,RELEASE)
+    fprintf(stdout, "likwid-memsweeper  %d.%d \n\n",VERSION,RELEASE); \
+    fflush(stdout);
 
 
 int main (int argc, char** argv)
 {
     int domainId = -1;
     int c;
+    int optSilent = 0;
     bstring argString;
+    FILE* OUTSTREAM = stdout;
 
-    while ((c = getopt (argc, argv, "+c:hv")) != -1)
+    while ((c = getopt (argc, argv, "+c:hvq")) != -1)
     {
         switch (c)
         {
@@ -75,6 +81,10 @@ int main (int argc, char** argv)
             case 'v':
                 VERSION_MSG;
                 exit (EXIT_SUCCESS);
+            case 'q':
+                optSilent = 1;
+                OUTSTREAM = NULL;
+                break;
             case 'c':
                 if (! (argString = bSecureInput(10,optarg)))
                 {
@@ -111,11 +121,16 @@ int main (int argc, char** argv)
 
     if (domainId < 0) 
     {
-        memsweep_node();
+        memsweep_node(OUTSTREAM);
+    }
+    else if (domainId < numa_info.numberOfNodes)
+    {
+        memsweep_domain(OUTSTREAM, domainId);
     }
     else
     {
-        memsweep_domain(domainId);
+        fprintf(stderr, "Unknown NUMA domain %d\n", domainId);
+        exit(EXIT_FAILURE);
     }
 
     return EXIT_SUCCESS;
diff --git a/src/applications/likwid-perfctr.c b/src/applications/likwid-perfctr.c
index fce52ea..6c9f98f 100644
--- a/src/applications/likwid-perfctr.c
+++ b/src/applications/likwid-perfctr.c
@@ -6,8 +6,8 @@
  *      Description:  An application to read out performance counter registers
  *                  on x86 processors
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -58,31 +58,35 @@
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 #define HELP_MSG \
-printf("likwid-perfctr --  Version  %d.%d \n\n",VERSION,RELEASE); \
-printf("\n"); \
-printf("Example Usage: likwid-perfctr -C 2  ./a.out \n"); \
-printf("Supported Options:\n"); \
-printf("-h\t Help message\n"); \
-printf("-v\t Version information\n"); \
-printf("-V\t verbose output\n"); \
-printf("-g\t performance group or event set string\n"); \
-printf("-H\t Get group help (together with -g switch) \n"); \
-printf("-t\t timeline mode with frequency in s or ms, e.g. 300ms\n"); \
-printf("-S\t stethoscope mode with duration in s\n"); \
-printf("-m\t use markers inside code \n"); \
-printf("-s\t bitmask with threads to skip\n"); \
-printf("-o\t Store output to file, with output conversation according to file suffix\n"); \
-printf("\t Conversation scripts can be supplied in %s\n",TOSTRING(LIKWIDFILTERPATH)); \
-printf("-O\t Output easily parseable CSV instead of fancy tables\n"); \
-printf("-M\t set how MSR registers are accessed: 0=direct, 1=msrd\n"); \
-printf("-a\t list available performance groups\n"); \
-printf("-e\t list available counters and events\n"); \
-printf("-i\t print cpu info\n"); \
-printf("-c\t processor ids to measure (required), e.g. 1,2-4,8\n"); \
-printf("-C\t processor ids to measure (this variant also cares for pinning of process/threads), e.g. 1,2-4,8\n");
+fprintf(stdout, "likwid-perfctr --  Version  %d.%d \n\n",VERSION,RELEASE); \
+fprintf(stdout, "\n"); \
+fprintf(stdout, "Example Usage: likwid-perfctr -C 2  ./a.out \n"); \
+fprintf(stdout, "Supported Options:\n"); \
+fprintf(stdout, "-h\t Help message\n"); \
+fprintf(stdout, "-v\t Version information\n"); \
+fprintf(stdout, "-V\t verbose output\n"); \
+fprintf(stdout, "-g\t performance group or event set string\n"); \
+fprintf(stdout, "-H\t Get group help (together with -g switch) \n"); \
+fprintf(stdout, "-t\t timeline mode with frequency in s or ms, e.g. 300ms\n"); \
+fprintf(stdout, "-S\t stethoscope mode with duration in s\n"); \
+fprintf(stdout, "-m\t use markers inside code \n"); \
+fprintf(stdout, "-s\t bitmask with threads to skip\n"); \
+fprintf(stdout, "-o\t Store output to file, with output conversation according to file suffix\n"); \
+fprintf(stdout, "\t Conversation scripts can be supplied in %s\n",TOSTRING(LIKWIDFILTERPATH)); \
+fprintf(stdout, "-O\t Output easily parseable CSV instead of fancy tables\n"); \
+fprintf(stdout, "-M\t set how MSR registers are accessed: 0=direct, 1=msrd\n"); \
+fprintf(stdout, "-a\t list available performance groups\n"); \
+fprintf(stdout, "-e\t list available counters and events\n"); \
+fprintf(stdout, "-i\t print cpu info\n"); \
+fprintf(stdout, "-c\t processor ids to measure (required), e.g 0,3-4,8\n"); \
+fprintf(stdout, "-C\t processor ids to measure (this variant also cares for pinning of process/threads)\n"); \
+fprintf(stdout, "\t\t for -c and -C, see likwid-pin -h for details\n"); \
+fflush(stdout);
+
 
 #define VERSION_MSG \
-printf("likwid-perfctr  %d.%d \n\n",VERSION,RELEASE);
+fprintf(stdout, "likwid-perfctr  %d.%d \n\n",VERSION,RELEASE); \
+fflush(stdout);
 
 /* To be able to give useful error messages instead of just dieing without a
  * comment. Mainly happens because we get a SIGPIPE if the daemon drops us. */
@@ -157,7 +161,6 @@ int main (int argc, char** argv)
             case 'c':
                 CHECK_OPTION_STRING;
                 numThreads = bstr_to_cpuset(threads, argString);
-
                 if(!numThreads)
                 {
                     ERROR_PLAIN_PRINT(Failed to parse cpu list.);
@@ -165,7 +168,8 @@ int main (int argc, char** argv)
 
                 break;
             case 'd':
-                printf("Option -d for daemon mode is deprecated. Daemon mode has be renamed to timeline mode (Option -t)!\n");
+                fprintf(stdout, "Option -d for daemon mode is deprecated. Daemon mode has be renamed to timeline mode (Option -t)!\n");
+                fflush(stdout);
                 break;
             case 'e':
                 numThreads=1; /*to get over the error message */
@@ -219,6 +223,12 @@ int main (int argc, char** argv)
             case 'S':
                 CHECK_OPTION_STRING;
                 optStethoscope = str2int((char*) argString->data);
+                if (optStethoscope <= 0)
+                {
+                    fprintf(stderr, "The measurement time must be larger than 0\n\n");
+                    HELP_MSG;
+                    exit(EXIT_FAILURE);
+                }
                 break;
             case 't':
                 CHECK_OPTION_STRING;
@@ -234,11 +244,11 @@ int main (int argc, char** argv)
                 perfmon_verbose = 1;
                 break;
             case '?':
-            	if (optopt == 'S'||optopt == 't'||optopt == 'c'||optopt == 'C'||
-            		optopt == 'o'||optopt == 'M'||optopt == 'g')
-            	{
-            	
-            	}
+                if (optopt == 'S'||optopt == 't'||optopt == 'c'||optopt == 'C'||
+                    optopt == 'o'||optopt == 'M'||optopt == 'g')
+                {
+
+                }
                 else if (isprint (optopt))
                 {
                     fprintf (stderr, "Unknown option `-%c'.\n", optopt);
@@ -261,6 +271,7 @@ int main (int argc, char** argv)
     if (!numThreads)
     {
         fprintf (stderr, "ERROR: Required -c. You must specify at least one processor.\n");
+        HELP_MSG;
         exit(EXIT_FAILURE);
     }
 
@@ -285,12 +296,12 @@ int main (int argc, char** argv)
             }
 
             bformata(pinString,",%d",threads[0]);
-			
-			if (skipMask > 0)
-			{
-            	skipString = bformat("%d",skipMask);
-				setenv("LIKWID_SKIP",(char*) skipString->data , 1);
-			}
+
+            if (skipMask > 0)
+            {
+                skipString = bformat("%d",skipMask);
+                setenv("LIKWID_SKIP",(char*) skipString->data , 1);
+            }
             setenv("KMP_AFFINITY", "disabled", 1);
             setenv("LIKWID_PIN",(char*) pinString->data , 1);
 
@@ -317,7 +328,12 @@ int main (int argc, char** argv)
         {
             if(i != j && threads[i] == threads[j])
             {
-                fprintf (stderr, "ERROR: Processor list is not unique.\n");
+                fprintf (stderr, "ERROR: Processor list (%d",threads[0]);
+                for (c=1;c<numThreads;c++)
+                {
+                    fprintf (stderr, ",%d",threads[c]);
+                }
+                fprintf (stderr, ") is not unique.\n");
                 exit(EXIT_FAILURE);
             }
         }
@@ -340,7 +356,7 @@ int main (int argc, char** argv)
         fprintf(OUTSTREAM,"CPU stepping:\t%u \n", cpuid_info.stepping);
         fprintf(OUTSTREAM,"CPU features:\t%s \n", cpuid_info.features);
 
-        if( cpuid_info.family == P6_FAMILY && cpuid_info.perf_version) 
+        if( cpuid_info.family == P6_FAMILY && cpuid_info.perf_version)
         {
             fprintf(OUTSTREAM,HLINE);
             fprintf(OUTSTREAM,"PERFMON version:\t%u \n",cpuid_info.perf_version);
@@ -350,6 +366,7 @@ int main (int argc, char** argv)
         }
     }
     fprintf(OUTSTREAM,HLINE);
+    fflush(OUTSTREAM);
 
     if (optInfo)
     {
@@ -371,11 +388,24 @@ int main (int argc, char** argv)
         perfmon_printEvents();
         exit (EXIT_SUCCESS);
     }
-    if ((!optTimeline && !optStethoscope) && (optind == argc)) 
+    if ((!optTimeline && !optStethoscope) && (optind == argc))
     {
         fprintf(OUTSTREAM,"NOTICE: You have to specify a program to measure as argument!\n");
         exit (EXIT_SUCCESS);
     }
+    argv +=  optind;
+    bstring exeString = bfromcstr(argv[0]);
+    for (i=1; i<(argc-optind); i++)
+        {
+            bconchar(exeString, ' ');
+            bcatcstr(exeString, argv[i]);
+        }
+    if (blength(exeString) == 0 && !optStethoscope)
+    {
+        fprintf(OUTSTREAM, "Executable must be given on commandline\n");
+        fflush(OUTSTREAM);
+        exit(EXIT_FAILURE);
+    }
     if (biseqcstr(eventString,"_NOGROUP"))
     {
         fprintf(OUTSTREAM,"NOTICE: You have to specify a group or event set to measure using the -g option.\n");
@@ -388,11 +418,20 @@ int main (int argc, char** argv)
     fprintf(OUTSTREAM,HLINE);
     fprintf(OUTSTREAM,"CPU type:\t%s \n",cpuid_info.name);
     fprintf(OUTSTREAM,"CPU clock:\t%3.2f GHz \n",  (float) timer_getCpuClock() * 1.E-09);
+    fflush(OUTSTREAM);
 
-    perfmon_setupEventSet(eventString, &counterMask);
     fprintf(OUTSTREAM,HLINE);
+    fflush(OUTSTREAM);
 
-    if (optTimeline)
+    if (optStethoscope)
+    {
+        perfmon_setupEventSet(eventString, &counterMask);
+        perfmon_startCounters();
+        sleep(optStethoscope);
+        perfmon_stopCounters();
+        perfmon_printCounterResults();
+    }
+    else if (optTimeline)
     {
         fprintf(OUTSTREAM,"CORES: %d", threads[0]);
         for (int i=1; i<numThreads; i++)
@@ -400,43 +439,38 @@ int main (int argc, char** argv)
             fprintf(OUTSTREAM," %d", threads[i]);
         }
         fprintf(OUTSTREAM," \n");
+        fflush(OUTSTREAM);
 
-        daemon_init(eventString);
-        daemon_start(interval);
-    }
-
-    argv +=  optind;
-    bstring exeString = bfromcstr(argv[0]);
-
-    if (optStethoscope)
-    {
-        perfmon_startCounters();
-        sleep(optStethoscope);
-        perfmon_stopCounters();
-        perfmon_printCounterResults();
+        daemon_start(eventString, interval);
+        if (system(bdata(exeString)) == EOF)
+        {
+            fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
+            exit(EXIT_FAILURE);
+        }
+        daemon_stop(SIGINT);
     }
     else
     {
-        for (i=1; i<(argc-optind); i++)
+        if (perfmon_verbose)
         {
-            bconchar(exeString, ' ');
-            bcatcstr(exeString, argv[i]);
+            fprintf(OUTSTREAM,"Executing: %s \n",bdata(exeString));
+            fflush(OUTSTREAM);
         }
-        if (perfmon_verbose) fprintf(OUTSTREAM,"Executing: %s \n",bdata(exeString));
 
         if (optReport)
         {
             //        multiplex_start();
         }
-        else if (!optUseMarker)
+        else if (!optUseMarker && !optTimeline)
         {
+            perfmon_setupEventSet(eventString, &counterMask);
             perfmon_startCounters();
         }
         else
         {
             if (getenv("LIKWID_FILEPATH") == NULL)
                 setenv("LIKWID_FILEPATH",(char*) filepath->data, 1);
-
+            perfmon_setupEventSet(eventString, &counterMask);
             char* modeStr = (char*) malloc(40 * sizeof(char));
             sprintf(modeStr,"%d",accessClient_mode);
             setenv("LIKWID_MODE", modeStr, 1);
@@ -447,8 +481,6 @@ int main (int argc, char** argv)
             perfmon_startCounters();
         }
 
-        fprintf(OUTSTREAM,"%s\n",bdata(exeString));
-
         if (system(bdata(exeString)) == EOF)
         {
             fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
@@ -462,15 +494,15 @@ int main (int argc, char** argv)
         }
         else
         {
-            if (!optUseMarker)
+            if (optUseMarker)
             {
                 perfmon_stopCounters();
-                perfmon_printCounterResults();
+                perfmon_printMarkerResults(filepath);
             }
             else
             {
                 perfmon_stopCounters();
-                perfmon_printMarkerResults(filepath);
+                perfmon_printCounterResults();
             }
         }
     }
diff --git a/src/applications/likwid-pin.c b/src/applications/likwid-pin.c
index e046df0..3d9e85b 100644
--- a/src/applications/likwid-pin.c
+++ b/src/applications/likwid-pin.c
@@ -5,8 +5,8 @@
  *
  *      Description:  An application to pin a program including threads
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -52,46 +52,55 @@
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 #define HELP_MSG \
-printf("likwid-pin --  Version %d.%d \n\n",VERSION,RELEASE); \
-printf("\n"); \
-printf("Supported Options:\n"); \
-printf("-h\t Help message\n"); \
-printf("-v\t Version information\n"); \
-printf("-i\t Set numa interleave policy with all involved numa nodes\n"); \
-printf("-S\t Sweep memory in involved numa nodes\n"); \
-printf("-c\t comma separated list of processor ids or expression\n"); \
-printf("-s\t bitmask with threads to skip\n"); \
-printf("-p\t Print available domains with mapping on physical ids\n"); \
-printf("  \t If used together with -c option outputs a physical processor ids.\n"); \
-printf("-d\t Delimiter used for using -p to output physical processor list, default is comma.\n\n"); \
-printf("-q\t Silent without output\n\n"); \
-printf("There are three possibilities to provide a thread to processor list:\n\n"); \
-printf("1. Thread list with physical or logical thread numberings and physical cores first.\n"); \
-printf("Example usage thread list: likwid-pin -c N:0,4-6 ./myApp\n"); \
-printf("You can pin with the following numberings:\n");  \
-printf("\t1. Physical numbering of OS.\n");  \
-printf("\t2. Logical numbering inside node. e.g. -c N:0-3\n");  \
-printf("\t3. Logical numbering inside socket. e.g. -c S0:0-3\n");  \
-printf("\t4. Logical numbering inside last level cache group. e.g. -c C0:0-3\n");  \
-printf("\t5. Logical numbering inside NUMA domain. e.g. -c M0:0-3\n");  \
-printf("\tYou can also mix domains separated by  @, e.g. -c S0:0-3@S1:0-3 \n\n");  \
-printf("2. Expressions based thread list generation with compact processor numbering.\n"); \
-printf("Example usage expression: likwid-pin -c E:N:8 ./myApp\n"); \
-printf("This will generate a compact list of thread to processor mapping for the node domain with eight threads.\n");  \
-printf("The following syntax variants are available:\n");  \
-printf("\t1. -c E:<thread domain>:<number of threads>\n");  \
-printf("\t2. -c E:<thread domain>:<number of threads>:<chunk size>:<stride>\n");  \
-printf("\t   For two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4\n\n");  \
-printf("3. Scatter policy among thread domain type.\n"); \
-printf("Example usage scatter: likwid-pin -c M:scatter ./myApp\n"); \
-printf("This will generate a thread to processor mapping scattered among all memory domains with physical cores first.\n\n");  \
-printf("If you ommit the -c option likwid will use all processors available on the node\n"); \
-printf("with physical cores first. likwid-pin will also set OMP_NUM_THREADS with as many\n"); \
-printf("threads as specified in your pin expression if OMP_NUM_THREADS is not present\n"); \
-printf("in your environment.\n\n")
+    fprintf(stdout, "likwid-pin --  Version %d.%d \n\n",VERSION,RELEASE); \
+    fprintf(stdout, "\n"); \
+    fprintf(stdout, "Supported Options:\n"); \
+    fprintf(stdout, "-h\t Help message\n"); \
+    fprintf(stdout, "-v\t Version information\n"); \
+    fprintf(stdout, "-i\t Set numa interleave policy with all involved numa nodes\n"); \
+    fprintf(stdout, "-S\t Sweep memory in involved numa nodes\n"); \
+    fprintf(stdout, "-c\t comma separated list of processor ids or expression\n"); \
+    fprintf(stdout, "-s\t bitmask with threads to skip\n"); \
+    fprintf(stdout, "-p\t Print available domains with mapping on physical ids\n"); \
+    fprintf(stdout, "  \t If used together with -c option outputs a physical processor ids.\n"); \
+    fprintf(stdout, "-d\t Delimiter used for using -p to output physical processor list, default is comma.\n\n"); \
+    fprintf(stdout, "-q\t Silent without output\n\n"); \
+    fprintf(stdout, "There are three possibilities to provide a thread to processor list:\n\n"); \
+    fprintf(stdout, "1. Thread list with physical or logical thread numberings and physical cores first.\n"); \
+    fprintf(stdout, "Example usage thread list: likwid-pin -c N:0,4-6 ./myApp\n"); \
+    fprintf(stdout, "You can pin with the following numberings:\n");  \
+    fprintf(stdout, "\t1. Physical numbering of OS.\n");  \
+    fprintf(stdout, "\t2. Logical numbering inside node. e.g. -c N:0-3\n");  \
+    fprintf(stdout, "\t3. Logical numbering inside socket. e.g. -c S0:0-3\n");  \
+    fprintf(stdout, "\t4. Logical numbering inside last level cache group. e.g. -c C0:0-3\n");  \
+    fprintf(stdout, "\t5. Logical numbering inside NUMA domain. e.g. -c M0:0-3\n");  \
+    fprintf(stdout, "\tYou can also mix domains separated by  @, e.g. -c S0:0-3@S1:0-3 \n\n");  \
+    fprintf(stdout, "2. Expressions based thread list generation with compact processor numbering.\n"); \
+    fprintf(stdout, "Example usage expression: likwid-pin -c E:N:8 ./myApp\n"); \
+    fprintf(stdout, "This will generate a compact list of thread to processor mapping for the node domain with eight threads.\n");  \
+    fprintf(stdout, "The following syntax variants are available:\n");  \
+    fprintf(stdout, "\t1. -c E:<thread domain>:<number of threads>\n");  \
+    fprintf(stdout, "\t2. -c E:<thread domain>:<number of threads>:<chunk size>:<stride>\n");  \
+    fprintf(stdout, "\t   For two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4\n\n");  \
+    fprintf(stdout, "3. Scatter policy among thread domain type.\n"); \
+    fprintf(stdout, "Example usage scatter: likwid-pin -c M:scatter ./myApp\n"); \
+    fprintf(stdout, "This will generate a thread to processor mapping scattered among all memory domains with physical cores first.\n\n");  \
+    fprintf(stdout, "4. Logical pinning.\n"); \
+    fprintf(stdout, "Example usage logical pinning: likwid-pin -c L:0,3,4 ./myApp\n"); \
+    fprintf(stdout, "This will generate a mapping containing the processors with index 0, 3 and 4 in the currently available processor list.\n");  \
+    fprintf(stdout, "If you are running inside a cpuset (taskset, cgroup) the sorted list of allowed processors is taken as processor list.\n");  \
+    fprintf(stdout, "Example usage logical pinning inside cpuset:\n"); \
+    fprintf(stdout, "taskset -c 4,7,2,1,5 likwid-pin -c L:0,2,4 ./myApp\n"); \
+    fprintf(stdout, "This maps the application to the processors 1,4,7.\n\n");  \
+    fprintf(stdout, "If you ommit the -c option likwid will use all processors available on the node\n"); \
+    fprintf(stdout, "with physical cores first. likwid-pin will also set OMP_NUM_THREADS with as many\n"); \
+    fprintf(stdout, "threads as specified in your pin expression if OMP_NUM_THREADS is not present\n"); \
+    fprintf(stdout, "in your environment.\n\n"); \
+    fflush(stdout);
 
 #define VERSION_MSG \
-    printf("likwid-pin   %d.%d \n\n",VERSION,RELEASE)
+    fprintf(stdout, "likwid-pin   %d.%d \n\n",VERSION,RELEASE); \
+    fflush(stdout);
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
     static void
@@ -105,22 +114,23 @@ pinPid(int cpuid, int silent)
 
     status = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
 
-    if (status == -1) 
+    if (status == -1)
     {
-        printf("sched_setaffinity failed : %s \n",strerror(errno));
+        fprintf(stderr, "sched_setaffinity failed : %s \n",strerror(errno));
     }
-    else 
+    else
     {
         if(!silent)
         {
 #ifdef COLOR
             color_on(BRIGHT, COLOR);
 #endif
-            printf("[likwid-pin] Main PID -> core %d - OK",  cpuid);
+            fprintf(stdout, "[likwid-pin] Main PID -> core %d - OK",  cpuid);
 #ifdef COLOR
             color_reset();
 #endif
-            printf("\n");
+            fprintf(stdout, "\n");
+            fflush(stdout);
         }
     }
 }
@@ -144,6 +154,7 @@ int main (int argc, char** argv)
     int numThreads=0;
     int threads[MAX_NUM_THREADS];
     char delimiter = ',';
+    FILE* OUTSTREAM = stdout;
     threads[0] = 0;
 
     if (argc ==  1) {
@@ -190,13 +201,14 @@ int main (int argc, char** argv)
             case 'p':
                 if (!hasAffinity)
                 {
-                    printf("Option -p is not supported for unknown processor!\n");
+                    fprintf(stderr, "Option -p is not supported for unknown processor!\n");
                     exit(EXIT_SUCCESS);
                 }
                 optPrintDomains = 1;
                 break;
             case 'q':
                 optSilent = 1;
+                OUTSTREAM = NULL;
                 setenv("LIKWID_SILENT","true", 1);
                 break;
             case 's':
@@ -206,7 +218,7 @@ int main (int argc, char** argv)
             case 'S':
                 if (!hasAffinity)
                 {
-                    printf("Option -S is not supported for unknown processor!\n");
+                    fprintf(stderr, "Option -S is not supported for unknown processor!\n");
                     exit(EXIT_SUCCESS);
                 }
                 optMemSweep = 1;
@@ -221,24 +233,28 @@ int main (int argc, char** argv)
     }
     if (optind == argc && !optPrintDomains)
     {
-    	fprintf(stderr,"Executable must be given on commandline\n");
-    	exit(EXIT_FAILURE);
+        fprintf(stderr,"Executable must be given on commandline\n");
+        exit(EXIT_FAILURE);
     }
 
     if (optPrintDomains && numThreads)
     {
-        printf("%d",threads[0]);
-
-        for ( i=1; i< numThreads; i++)
+        if ((!optSilent) && (OUTSTREAM))
         {
-            printf("%c%d",delimiter,threads[i]);
+            fprintf(OUTSTREAM, "%d",threads[0]);
+
+            for ( i=1; i< numThreads; i++)
+            {
+                fprintf(OUTSTREAM, "%c%d",delimiter,threads[i]);
+            }
+            fprintf(OUTSTREAM, "\n");
+            fflush(OUTSTREAM);
         }
-        printf("\n");
         exit (EXIT_SUCCESS);
     }
     else if ( optPrintDomains )
     {
-        affinity_printDomains();
+        affinity_printDomains(OUTSTREAM);
         exit (EXIT_SUCCESS);
     }
 
@@ -258,14 +274,22 @@ int main (int argc, char** argv)
 
     if (optInterleaved)
     {
-        printf("Set mem_policy to interleaved\n");
+        if ((!optSilent) && (OUTSTREAM))
+        {
+            fprintf(OUTSTREAM, "Set mem_policy to interleaved\n");
+            fflush(OUTSTREAM);
+        }
         numa_setInterleaved(threads, numThreads);
     }
 
     if (optMemSweep)
     {
-        printf("Sweeping memory\n");
-        memsweep_threadGroup(threads, numThreads);
+        if ((!optSilent) && (OUTSTREAM))
+        {
+            fprintf(OUTSTREAM, "Sweeping memory\n");
+            fflush(OUTSTREAM);
+        }
+        memsweep_threadGroup(OUTSTREAM, threads, numThreads);
     }
 
     if ( getenv("OMP_NUM_THREADS") == NULL )
@@ -287,11 +311,11 @@ int main (int argc, char** argv)
 
         bformata(pinString,",%d",threads[0]);
 
-		if (skipMask >= 0)
-		{
-        	skipString = bformat("%d",skipMask);
-			setenv("LIKWID_SKIP",(char*) bdata(skipString) , 1);
-		}
+        if (skipMask >= 0)
+        {
+            skipString = bformat("%d",skipMask);
+            setenv("LIKWID_SKIP",(char*) bdata(skipString) , 1);
+        }
 
         setenv("KMP_AFFINITY", "disabled", 1);
         setenv("LIKWID_PIN",(char*) bdata(pinString) , 1);
diff --git a/src/applications/likwid-powermeter.c b/src/applications/likwid-powermeter.c
index 4843fa8..4daa393 100644
--- a/src/applications/likwid-powermeter.c
+++ b/src/applications/likwid-powermeter.c
@@ -6,8 +6,8 @@
  *      Description:  An application to get information about power 
  *      consumption on architectures implementing the RAPL interface.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -49,25 +49,28 @@
 #include <perfmon.h>
 #include <power.h>
 #include <thermal.h>
+#include <bstrlib.h>
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
 #define HELP_MSG \
-printf("\nlikwid-powermeter --  Version  %d.%d \n\n",VERSION,RELEASE); \
-printf("A tool to print Power and Clocking information on Intel SandyBridge CPUS.\n"); \
-printf("Options:\n"); \
-printf("-h\t\t Help message\n"); \
-printf("-v\t\t Version information\n"); \
-printf("-M <0|1>\t set how MSR registers are accessed: 0=direct, 1=msrd \n"); \
-printf("-c <list>\t specify sockets to measure\n"); \
-printf("-i\t\t print information from MSR_PKG_POWER_INFO register and Turbo Mode\n"); \
-printf("-s <duration>\t set measure duration in sec. (default 2s) \n"); \
-printf("-p\t\t print dynamic clocking and CPI values (requires executable)\n\n");   \
-printf("Usage: likwid-powermeter -s 4 -c 1 \n");  \
-printf("Alternative as wrapper: likwid-powermeter -c 1 ./a.out\n")
+fprintf(stdout, "\nlikwid-powermeter --  Version  %d.%d \n\n",VERSION,RELEASE); \
+fprintf(stdout, "A tool to print Power and Clocking information on Intel SandyBridge CPUS.\n"); \
+fprintf(stdout, "Options:\n"); \
+fprintf(stdout, "-h\t\t Help message\n"); \
+fprintf(stdout, "-v\t\t Version information\n"); \
+fprintf(stdout, "-M <0|1>\t set how MSR registers are accessed: 0=direct, 1=msrd \n"); \
+fprintf(stdout, "-c <list>\t specify sockets to measure\n"); \
+fprintf(stdout, "-i\t\t print information from MSR_PKG_POWER_INFO register and Turbo Mode\n"); \
+fprintf(stdout, "-s <duration>\t set measure duration in sec. (default 2s) \n"); \
+fprintf(stdout, "-p\t\t print dynamic clocking and CPI values (requires executable)\n\n");   \
+fprintf(stdout, "Usage: likwid-powermeter -s 4 -c 1 \n");  \
+fprintf(stdout, "Alternative as wrapper: likwid-powermeter -c 1 ./a.out\n"); \
+fflush(stdout);
 
 #define VERSION_MSG \
-printf("likwid-powermeter  %d.%d \n\n",VERSION,RELEASE)
+fprintf(stdout, "likwid-powermeter  %d.%d \n\n",VERSION,RELEASE); \
+fflush(stdout);
 
 
 int main (int argc, char** argv)
@@ -77,25 +80,28 @@ int main (int argc, char** argv)
     int optClock = 0;
     int optStethoscope = 0;
     int optSockets = 0;
+    int optTemp = 0;
     double runtime;
     int hasDRAM = 0;
-    int c;
+    int hasPP0 = 0;
+    int hasPP1 = 0;
+    int c, i;
     bstring argString;
     bstring eventString = bfromcstr("CLOCK");
     int numSockets=1;
     int numThreads=0;
     int threadsSockets[MAX_NUM_NODES*2];
     int threads[MAX_NUM_THREADS];
-
+    const AffinityDomain* socketDomains[MAX_NUM_NODES*2];
     threadsSockets[0] = 0;
-    
+
     if (argc == 1)
     {
-    	HELP_MSG;
-    	exit (EXIT_SUCCESS);
+        HELP_MSG;
+        exit (EXIT_SUCCESS);
     }
 
-    while ((c = getopt (argc, argv, "+c:hiM:ps:v")) != -1)
+    while ((c = getopt (argc, argv, "+c:hiM:ps:vt")) != -1)
     {
         switch (c)
         {
@@ -128,11 +134,14 @@ int main (int argc, char** argv)
             case 'v':
                 VERSION_MSG;
                 exit (EXIT_SUCCESS);
+            case 't':
+                optTemp = 1;
+                break;
             case '?':
-            	if (optopt == 's' || optopt == 'M' || optopt == 'c')
-            	{
-            		HELP_MSG;
-            	}
+                if (optopt == 's' || optopt == 'M' || optopt == 'c')
+                {
+                    HELP_MSG;
+                }
                 else if (isprint (optopt))
                 {
                     fprintf (stderr, "Unknown option `-%c'.\n", optopt);
@@ -155,16 +164,20 @@ int main (int argc, char** argv)
         fprintf(stderr,"Access to performance counters is locked.\n");
         exit(EXIT_FAILURE);
     }
-    
     if (optClock && optind == argc)
     {
-    	fprintf(stderr,"Commandline option -p requires an executable.\n");
-    	exit(EXIT_FAILURE);
+        fprintf(stderr,"Commandline option -p requires an executable.\n");
+        exit(EXIT_FAILURE);
     }
     if (optSockets && !optStethoscope && optind == argc)
     {
-    	fprintf(stderr,"Commandline option -c requires an executable if not used in combination with -s.\n");
-    	exit(EXIT_FAILURE);
+        fprintf(stderr,"Commandline option -c requires an executable if not used in combination with -s.\n");
+        exit(EXIT_FAILURE);
+    }
+    if (optStethoscope == 0 && optind == argc && !optInfo)
+    {
+        fprintf(stderr,"Either -s <seconds> or executable must be given on commandline.\n");
+        exit(EXIT_FAILURE);
     }
 
     if (cpuid_init() == EXIT_FAILURE)
@@ -172,15 +185,27 @@ int main (int argc, char** argv)
         fprintf(stderr, "CPU not supported\n");
         exit(EXIT_FAILURE);
     }
-    
     if (numSockets > cpuid_topology.numSockets)
     {
-    	fprintf(stderr, "System has only %d sockets but %d are given on commandline\n",
-    			cpuid_topology.numSockets, numSockets);
-    	exit(EXIT_FAILURE);
+        fprintf(stderr, "System has only %d sockets but %d are given on commandline.\n",
+                        cpuid_topology.numSockets, numSockets);
+        exit(EXIT_FAILURE);
+    }
+
+    numa_init();
+    affinity_init();
+
+    for (c = 0; c < numSockets; c++)
+    {
+        if (threadsSockets[c] >= cpuid_topology.numSockets)
+        {
+            fprintf(stderr, "System has no socket %d\n", threadsSockets[c]);
+            exit(EXIT_FAILURE);
+        }
+        bstring socketStr = bformat("S%d",threadsSockets[c]);
+        socketDomains[threadsSockets[c]] = affinity_getDomain(socketStr);
     }
 
-    numa_init(); /* consider NUMA node as power unit for the moment */
     accessClient_init(&socket_fd);
     msr_init(socket_fd);
     timer_init();
@@ -191,49 +216,93 @@ int main (int argc, char** argv)
             (cpuid_info.model == IVYBRIDGE) ||
             (cpuid_info.model == IVYBRIDGE_EP) ||
             (cpuid_info.model == HASWELL) ||
+            (cpuid_info.model == HASWELL_EX) ||
             (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
             (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-            (cpuid_info.model == NEHALEM_WESTMERE))
+            (cpuid_info.model == NEHALEM_WESTMERE) ||
+            (cpuid_info.model == ATOM_SILVERMONT_C) ||
+            (cpuid_info.model == ATOM_SILVERMONT_E) ||
+            (cpuid_info.model == ATOM_SILVERMONT_F1) ||
+            (cpuid_info.model == ATOM_SILVERMONT_F2) ||
+            (cpuid_info.model == ATOM_SILVERMONT_F3))
     {
-        power_init(numa_info.nodes[0].processors[0]);
+        if (numSockets == 0)
+        {
+            numSockets = numa_info.numberOfNodes;
+        }
+        for(int i=0; i<numSockets; i++)
+        {
+            power_init(socketDomains[threadsSockets[i]]->processorList[0]);
+        }
     }
     else
     {
-        fprintf (stderr, "Query Turbo Mode only supported on Intel Nehalem/Westmere/SandyBridge/IvyBridge/Haswell processors!\n");
+        fprintf (stderr, "Query Turbo Mode only supported on Intel Nehalem/Westmere/SandyBridge/IvyBridge/Haswell/Silvermont processors!\n");
         exit(EXIT_FAILURE);
     }
 
     double clock = (double) timer_getCpuClock();
 
-    printf(HLINE);
-    printf("CPU name:\t%s \n",cpuid_info.name);
-    printf("CPU clock:\t%3.2f GHz \n",  (float) clock * 1.E-09);
-    printf(HLINE);
+    fprintf(stdout, HLINE);
+    fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
+    fprintf(stdout, "CPU clock:\t%3.2f GHz \n",  (float) clock * 1.E-09);
+    fprintf(stdout, HLINE);
+    fflush(stdout);
 
     if (optInfo)
     {
         if (power_info.turbo.numSteps != 0)
         {
-            printf("Base clock:\t%.2f MHz \n",  power_info.baseFrequency );
-            printf("Minimal clock:\t%.2f MHz \n",  power_info.minFrequency );
-            printf("Turbo Boost Steps:\n");
+            fprintf(stdout, "Base clock:\t%.2f MHz \n",  power_info.baseFrequency );
+            fprintf(stdout, "Minimal clock:\t%.2f MHz \n",  power_info.minFrequency );
+            fprintf(stdout, "Turbo Boost Steps:\n");
             for (int i=0; i < power_info.turbo.numSteps; i++ )
             {
-                printf("C%d %.2f MHz \n",i+1,  power_info.turbo.steps[i] );
+                fprintf(stdout, "C%d %.2f MHz \n",i+1,  power_info.turbo.steps[i] );
             }
         }
-        printf(HLINE);
+        fprintf(stdout, HLINE);
+        fflush(stdout);
     }
 
-    if (cpuid_info.model == SANDYBRIDGE_EP)
+    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
+        (cpuid_info.model == IVYBRIDGE_EP) ||
+        (cpuid_info.model == HASWELL_EX) ||
+        (cpuid_info.model == HASWELL))
     {
         hasDRAM = 1;
     }
-    else if ((cpuid_info.model != SANDYBRIDGE) &&
-            (cpuid_info.model != SANDYBRIDGE_EP)  &&
-            (cpuid_info.model != IVYBRIDGE)  &&
-            (cpuid_info.model != IVYBRIDGE_EP)  &&
-            (cpuid_info.model != HASWELL))
+    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
+        (cpuid_info.model == SANDYBRIDGE) ||
+        (cpuid_info.model == IVYBRIDGE_EP) ||
+        (cpuid_info.model == IVYBRIDGE) ||
+        (cpuid_info.model == HASWELL) ||
+        (cpuid_info.model == ATOM_SILVERMONT_E) ||
+        (cpuid_info.model == ATOM_SILVERMONT_F1) ||
+        (cpuid_info.model == ATOM_SILVERMONT_F2) ||
+        (cpuid_info.model == ATOM_SILVERMONT_F3))
+    {
+        hasPP0 = 1;
+    }
+    if ((cpuid_info.model == HASWELL) ||
+        (cpuid_info.model == SANDYBRIDGE) ||
+        (cpuid_info.model == IVYBRIDGE))
+    {
+        hasPP1 = 1;
+    }
+    if ((cpuid_info.model != SANDYBRIDGE) &&
+        (cpuid_info.model != SANDYBRIDGE_EP)  &&
+        (cpuid_info.model != IVYBRIDGE)  &&
+        (cpuid_info.model != IVYBRIDGE_EP)  &&
+        (cpuid_info.model != HASWELL) &&
+        (cpuid_info.model != HASWELL_M1) &&
+        (cpuid_info.model != HASWELL_M2) &&
+        (cpuid_info.model != HASWELL_EX) &&
+        (cpuid_info.model != ATOM_SILVERMONT_C) &&
+        (cpuid_info.model != ATOM_SILVERMONT_E) &&
+        (cpuid_info.model != ATOM_SILVERMONT_F1) &&
+        (cpuid_info.model != ATOM_SILVERMONT_F2) &&
+        (cpuid_info.model != ATOM_SILVERMONT_F3))
     {
         fprintf (stderr, "RAPL not supported on this processor!\n");
         exit(EXIT_FAILURE);
@@ -241,21 +310,24 @@ int main (int argc, char** argv)
 
     if (optInfo)
     {
-        printf("Thermal Spec Power: %g Watts \n", power_info.tdp );
-        printf("Minimum  Power: %g Watts \n", power_info.minPower);
-        printf("Maximum  Power: %g Watts \n", power_info.maxPower);
-        printf("Maximum  Time Window: %g micro sec \n", power_info.maxTimeWindow);
-        printf(HLINE);
+        fprintf(stdout, "Thermal Spec Power: %g Watts \n", power_info.tdp );
+        fprintf(stdout, "Minimum  Power: %g Watts \n", power_info.minPower);
+        fprintf(stdout, "Maximum  Power: %g Watts \n", power_info.maxPower);
+        fprintf(stdout, "Maximum  Time Window: %g micro sec \n", power_info.maxTimeWindow);
+        fprintf(stdout, HLINE);
+        fflush(stdout);
         exit(EXIT_SUCCESS);
     }
 
     if (optClock)
     {
         affinity_init();
-        argString = bformat("S%u:0-%u", threadsSockets[0], cpuid_topology.numCoresPerSocket-1);
+        argString = bformat("S%u:0-%u", threadsSockets[0],
+                        socketDomains[threadsSockets[0]]->numberOfProcessors-1);
         for (int i=1; i<numSockets; i++)
         {
-            bstring tExpr = bformat("@S%u:0-%u", threadsSockets[i], cpuid_topology.numCoresPerSocket-1);
+            bstring tExpr = bformat("@S%u:0-%u", threadsSockets[i],
+                                socketDomains[threadsSockets[i]]->numberOfProcessors-1);
             bconcat(argString, tExpr);
         }
         numThreads = bstr_to_cpuset(threads, argString);
@@ -267,12 +339,15 @@ int main (int argc, char** argv)
     {
         PowerData pDataPkg[MAX_NUM_NODES*2];
         PowerData pDataDram[MAX_NUM_NODES*2];
-        printf("Measure on sockets: %d", threadsSockets[0]);
+        PowerData pDataPP0[MAX_NUM_NODES*2];
+        PowerData pDataPP1[MAX_NUM_NODES*2];
+        fprintf(stdout, "Measure on sockets: %d", threadsSockets[0]);
         for (int i=1; i<numSockets; i++)
         {
-            printf(", %d", threadsSockets[i]);
+            fprintf(stdout, ", %d", threadsSockets[i]);
         }
-        printf("\n");
+        fprintf(stdout, "\n");
+        fflush(stdout);
 
         if (optStethoscope)
         {
@@ -284,9 +359,11 @@ int main (int argc, char** argv)
             {
                 for (int i=0; i<numSockets; i++)
                 {
-                    int cpuId = numa_info.nodes[threadsSockets[i]].processors[0];
-                    if (hasDRAM) power_start(pDataDram+i, cpuId, DRAM);
-                    power_start(pDataPkg+i, cpuId, PKG);
+                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
+                    if (hasDRAM) power_start(&(pDataDram[i]), cpuId, DRAM);
+                    if (hasPP0) power_start(&(pDataPP0[i]), cpuId, PP0);
+                    if (hasPP1) power_start(&(pDataPP1[i]), cpuId, PP1);
+                    power_start(&(pDataPkg[i]), cpuId, PKG);
                 }
             }
             sleep(optStethoscope);
@@ -301,9 +378,11 @@ int main (int argc, char** argv)
             {
                 for (int i=0; i<numSockets; i++)
                 {
-                    int cpuId = numa_info.nodes[threadsSockets[i]].processors[0];
-                    power_stop(pDataPkg+i, cpuId, PKG);
-                    if (hasDRAM) power_stop(pDataDram+i, cpuId, DRAM);
+                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
+                    power_stop(&(pDataPkg[i]), cpuId, PKG);
+                    if (hasPP1) power_stop(&(pDataPP1[i]), cpuId, PP1);
+                    if (hasPP0) power_stop(&(pDataPP0[i]), cpuId, PP0);
+                    if (hasDRAM) power_stop(&(pDataDram[i]), cpuId, DRAM);
                 }
             }
             runtime = (double) optStethoscope;
@@ -319,7 +398,8 @@ int main (int argc, char** argv)
                 bconchar(exeString, ' ');
                 bcatcstr(exeString, argv[i]);
             }
-            printf("%s\n",bdata(exeString));
+            fprintf(stdout, "Executing: %s\n",bdata(exeString));
+            fflush(stdout);
 
 
             if (optClock)
@@ -330,9 +410,11 @@ int main (int argc, char** argv)
             {
                 for (int i=0; i<numSockets; i++)
                 {
-                    int cpuId = numa_info.nodes[threadsSockets[i]].processors[0];
-                    if (hasDRAM) power_start(pDataDram+i, cpuId, DRAM);
-                    power_start(pDataPkg+i, cpuId, PKG);
+                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
+                    if (hasDRAM) power_start(&(pDataDram[i]), cpuId, DRAM);
+                    if (hasPP0) power_start(&(pDataPP0[i]), cpuId, PP0);
+                    if (hasPP1) power_start(&(pDataPP1[i]), cpuId, PP1);
+                    power_start(&(pDataPkg[i]), cpuId, PKG);
                 }
 
                 timer_start(&time);
@@ -356,9 +438,11 @@ int main (int argc, char** argv)
 
                 for (int i=0; i<numSockets; i++)
                 {
-                    int cpuId = numa_info.nodes[threadsSockets[i]].processors[0];
-                    power_stop(pDataPkg+i, cpuId, PKG);
-                    if (hasDRAM) power_stop(pDataDram+i, cpuId, DRAM);
+                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
+                    power_stop(&(pDataPkg[i]), cpuId, PKG);
+                    if (hasDRAM) power_stop(&(pDataDram[i]), cpuId, DRAM);
+                    if (hasPP0) power_stop(&(pDataPP0[i]), cpuId, PP0);
+                    if (hasPP1) power_stop(&(pDataPP1[i]), cpuId, PP1);
                 }
                 runtime = timer_print(&time);
             }
@@ -366,39 +450,56 @@ int main (int argc, char** argv)
 
         if (!optClock)
         {
-            printf("Runtime: %g second \n",runtime);
-            printf(HLINE);
+            fprintf(stdout, "Runtime: %g second \n",runtime);
+            fprintf(stdout, HLINE);
             for (int i=0; i<numSockets; i++)
             {
-                printf("Socket %d\n",threadsSockets[i]);
-                printf("Domain: PKG \n");
-                printf("Energy consumed: %g Joules \n", power_printEnergy(pDataPkg+i));
-                printf("Power consumed: %g Watts \n", power_printEnergy(pDataPkg+i) / runtime );
+                fprintf(stdout, "Socket %d (Measured on CPU %d)\n",threadsSockets[i],
+                                    socketDomains[threadsSockets[i]]->processorList[0]);
+                fprintf(stdout, "Domain: PKG \n");
+                fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPkg[i])));
+                fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPkg[i])) / runtime );
                 if (hasDRAM)
                 {
-                    printf("Domain: DRAM \n");
-                    printf("Energy consumed: %g Joules \n", power_printEnergy(pDataDram+i));
-                    printf("Power consumed: %g Watts \n", power_printEnergy(pDataDram+i) / runtime );
+                    fprintf(stdout, "Domain: DRAM \n");
+                    fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataDram[i])));
+                    fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataDram[i])) / runtime );
                 }
-                printf("\n");
+                if (hasPP0)
+                {
+                    fprintf(stdout, "Domain: PP0 \n");
+                    fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPP0[i])));
+                    fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPP0[i])) / runtime );
+                }
+                if (hasPP1)
+                {
+                    fprintf(stdout, "Domain: PP1 \n");
+                    fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPP1[i])));
+                    fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPP1[i])) / runtime );
+                }
+                fprintf(stdout, "\n");
             }
+            fflush(stdout);
         }
     }
 
-#if 0
-    if ( cpuid_hasFeature(TM2) )
+
+    if ( optTemp && cpuid_hasFeature(TM2))
     {
-        thermal_init(0);
         printf("Current core temperatures:\n");
-
-        for (uint32_t i = 0; i < cpuid_topology.numCoresPerSocket; i++ )
+        for (i = 0; i < numSockets; i++)
         {
-            printf("Core %d: %u C\n",
-                    numa_info.nodes[socketId].processors[i],
-                    thermal_read(numa_info.nodes[socketId].processors[i]));
+            printf("Socket %d\n",threadsSockets[i]);
+            for (c = 0; c < socketDomains[threadsSockets[i]]->numberOfProcessors; c++ )
+            {
+                thermal_init(i);
+                printf("Core %d: %u C\n",
+                        socketDomains[threadsSockets[i]]->processorList[c],
+                        thermal_read(socketDomains[threadsSockets[i]]->processorList[c]));
+            }
         }
     }
-#endif
+
 
     msr_finalize();
     return EXIT_SUCCESS;
diff --git a/src/applications/likwid-topology.c b/src/applications/likwid-topology.c
index d381ef8..7ba0e33 100644
--- a/src/applications/likwid-topology.c
+++ b/src/applications/likwid-topology.c
@@ -6,8 +6,8 @@
  *      Description:  A application to determine the thread and cache topology
  *                    on x86 processors.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -51,24 +51,26 @@
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
 #define HELP_MSG \
-printf("\nlikwid-topology --  Version %d.%d \n\n",VERSION,RELEASE); \
-printf("A tool to print the thread and cache topology on x86 CPUs.\n"); \
-printf("Options:\n"); \
-printf("-h\t Help message\n"); \
-printf("-v\t Version information\n"); \
-printf("-c\t list cache information\n"); \
-printf("-C\t measure processor clock\n"); \
-printf("-o\t Store output to file, with output conversation according to file suffix\n"); \
-printf("\t Conversation scripts can be supplied in %s\n",TOSTRING(LIKWIDFILTERPATH)); \
-printf("-g\t graphical output\n\n")
+    fprintf(OUTSTREAM, "\nlikwid-topology --  Version %d.%d \n\n",VERSION,RELEASE); \
+    fprintf(OUTSTREAM, "A tool to print the thread and cache topology on x86 CPUs.\n"); \
+    fprintf(OUTSTREAM, "Options:\n"); \
+    fprintf(OUTSTREAM, "-h\t Help message\n"); \
+    fprintf(OUTSTREAM, "-v\t Version information\n"); \
+    fprintf(OUTSTREAM, "-c\t list cache information\n"); \
+    fprintf(OUTSTREAM, "-C\t measure processor clock\n"); \
+    fprintf(OUTSTREAM, "-o\t Store output to file, with output conversion according to file suffix\n"); \
+    fprintf(OUTSTREAM, "\t Conversion scripts can be supplied in %s\n",TOSTRING(LIKWIDFILTERPATH)); \
+    fprintf(OUTSTREAM, "-g\t graphical output\n\n"); \
+    fflush(OUTSTREAM);
 
 #define VERSION_MSG \
-printf("likwid-topology  %d.%d \n\n",VERSION,RELEASE)
+    fprintf(OUTSTREAM, "likwid-topology  %d.%d \n\n",VERSION,RELEASE); \
+    fflush(OUTSTREAM);
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
 int main (int argc, char** argv)
-{ 
+{
     int optGraphical = 0;
     int optCaches = 0;
     int optClock = 0;
@@ -140,12 +142,12 @@ int main (int argc, char** argv)
     numa_init();
 
     fprintf(OUTSTREAM, HLINE);
-    fprintf(OUTSTREAM, "CPU type:\t%s \n",cpuid_info.name);
+    fprintf(OUTSTREAM, "CPU type:\t%s\n",cpuid_info.name);
 
     if (optClock)
     {
         timer_init();
-        fprintf(OUTSTREAM, "CPU clock:\t%3.2f GHz \n",  (float) timer_getCpuClock() * 1.E-09);
+        fprintf(OUTSTREAM, "CPU clock:\t%3.2f GHz\n",  (float) timer_getCpuClock() * 1.E-09);
     }
 
     /*----------------------------------------------------------------------
@@ -190,6 +192,7 @@ int main (int argc, char** argv)
         fprintf(OUTSTREAM, ")\n");
     }
     fprintf(OUTSTREAM, HLINE"\n");
+    fflush(OUTSTREAM);
 
     /*----------------------------------------------------------------------
      *  Cache Topology
@@ -208,7 +211,7 @@ int main (int argc, char** argv)
                 fprintf(OUTSTREAM, "Size:\t%d kB\n",
                         cpuid_topology.cacheLevels[i].size/1024);
             }
-            else 
+            else
             {
                 fprintf(OUTSTREAM, "Size:\t%d MB\n",
                         cpuid_topology.cacheLevels[i].size/1048576);
@@ -236,7 +239,7 @@ int main (int argc, char** argv)
                         cpuid_topology.cacheLevels[i].associativity);
                 fprintf(OUTSTREAM, "Number of sets:\t%d\n",
                         cpuid_topology.cacheLevels[i].sets);
-                fprintf(OUTSTREAM, "Cache line size:%d\n",
+                fprintf(OUTSTREAM, "Cache line size:\t%d\n",
                         cpuid_topology.cacheLevels[i].lineSize);
                 if(cpuid_topology.cacheLevels[i].inclusive)
                 {
@@ -289,6 +292,7 @@ int main (int argc, char** argv)
     }
 
     fprintf(OUTSTREAM, "\n");
+    fflush(OUTSTREAM);
 
     /*----------------------------------------------------------------------
      *  NUMA Topology
@@ -331,6 +335,7 @@ int main (int argc, char** argv)
         }
     }
     fprintf(OUTSTREAM, "\n");
+    fflush(OUTSTREAM);
 
     /*----------------------------------------------------------------------
      *  Graphical topology
@@ -345,7 +350,7 @@ int main (int argc, char** argv)
         fprintf(OUTSTREAM, SLINE);
 
         /* Allocate without instruction cache */
-        if ( cpuid_info.family == P6_FAMILY || cpuid_info.family == MIC_FAMILY ) 
+        if ( cpuid_info.family == P6_FAMILY || cpuid_info.family == MIC_FAMILY )
         {
             container = asciiBoxes_allocateContainer(
                     cpuid_topology.numCacheLevels,
@@ -384,7 +389,7 @@ int main (int argc, char** argv)
                     tmp++;
                     threadNode = tree_getNextNode(threadNode);
                 }
-                asciiBoxes_addBox(container, 0, j, boxLabel); 
+                asciiBoxes_addBox(container, 0, j, boxLabel);
                 j++;
                 coreNode = tree_getNextNode(coreNode);
             }
@@ -422,7 +427,7 @@ int main (int argc, char** argv)
                                 boxLabel = bformat("%dkB",
                                         cpuid_topology.cacheLevels[i].size/1024);
                             }
-                            else 
+                            else
                             {
                                 boxLabel = bformat("%dMB",
                                         cpuid_topology.cacheLevels[i].size/1048576);
@@ -443,17 +448,17 @@ int main (int argc, char** argv)
                                         lineCursor,
                                         columnCursor,
                                         columnCursor+cacheWidth,
-                                        boxLabel); 
+                                        boxLabel);
 
                                 columnCursor += sharedCores;
                             }
-                            else 
+                            else
                             {
                                 asciiBoxes_addBox(
                                         container,
                                         lineCursor,
                                         columnCursor,
-                                        boxLabel); 
+                                        boxLabel);
 
                                 columnCursor++;
                             }
@@ -464,7 +469,7 @@ int main (int argc, char** argv)
                 }
             }
 
-            asciiBoxes_print(container);
+            asciiBoxes_print(OUTSTREAM, container);
             socketNode = tree_getNextNode(socketNode);
         }
         bdestroy(boxLabel);
@@ -475,22 +480,23 @@ int main (int argc, char** argv)
     /* call filterscript if specified */
     if (!biseqcstr(filterScript,"NO"))
     {
-    	struct bstrList* tokens;
-    	tokens = bsplit(filterScript,' ');
-    	if (access(bdata(tokens->entry[0]), F_OK))
-    	{
-    		fprintf(stderr, "Cannot find filter %s!\n", bdata(tokens->entry[0]));
-    		bstrListDestroy(tokens);
-    		exit(EXIT_FAILURE);
-    	}
-    	if (access(bdata(tokens->entry[0]), X_OK))
-    	{
-    		fprintf(stderr, "Cannot execute filter %s!\n", bdata(tokens->entry[0]));
-    		bstrListDestroy(tokens);
-    		exit(EXIT_FAILURE);
-    	}
-    	bstrListDestroy(tokens);
+        struct bstrList* tokens;
+        tokens = bsplit(filterScript,' ');
+        if (access(bdata(tokens->entry[0]), F_OK))
+        {
+            fprintf(stderr, "Cannot find filter %s!\n", bdata(tokens->entry[0]));
+            bstrListDestroy(tokens);
+            exit(EXIT_FAILURE);
+        }
+        if (access(bdata(tokens->entry[0]), X_OK))
+        {
+            fprintf(stderr, "Cannot execute filter %s!\n", bdata(tokens->entry[0]));
+            bstrListDestroy(tokens);
+            exit(EXIT_FAILURE);
+        }
+        bstrListDestroy(tokens);
         bcatcstr(filterScript, " topology");
+
         if (system(bdata(filterScript)) == EOF)
         {
             fprintf(stderr, "Failed to execute filter %s!\n", bdata(filterScript));
diff --git a/src/asciiBoxes.c b/src/asciiBoxes.c
index c6560e3..a22dab5 100644
--- a/src/asciiBoxes.c
+++ b/src/asciiBoxes.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Module implementing output of nested ascii art boxes
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -108,7 +108,7 @@ asciiBoxes_addJoinedBox(
 }
 
 void
-asciiBoxes_print(BoxContainer* container)
+asciiBoxes_print(FILE* OUTSTREAM, BoxContainer* container)
 {
     int width;
     int boxwidth=0; /* box width is inner width of box */
@@ -251,5 +251,6 @@ asciiBoxes_print(BoxContainer* container)
         printf("-");
     }
     printf("+\n");
+    fflush(stdout);
 }
 
diff --git a/src/asciiTable.c b/src/asciiTable.c
index 3e4b508..29b615a 100644
--- a/src/asciiTable.c
+++ b/src/asciiTable.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Module implementing output of ascii table.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/barrier.c b/src/barrier.c
index c5faad4..3a93f92 100644
--- a/src/barrier.c
+++ b/src/barrier.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Implementation of threaded spin loop barrier
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -122,7 +122,7 @@ barrier_registerThread(BarrierData* barr, int groupId, int threadId)
 
 void
 barrier_init(int numberOfGroups) 
-{ 
+{
     maxGroupId = numberOfGroups-1;
     groups = (BarrierGroup*) malloc(numberOfGroups * sizeof(BarrierGroup));
 }
@@ -151,5 +151,5 @@ barrier_synchronize(BarrierData* barr)
 
 void barrier_destroy(void)
 {
-	free(groups);
+    free(groups);
 }
diff --git a/src/bench.c b/src/bench.c
index 4460552..3a0b81b 100644
--- a/src/bench.c
+++ b/src/bench.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Benchmarking framework for likwid-bench
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -43,26 +43,38 @@
 #include <affinity.h>
 #include <barrier.h>
 #include <likwid.h>
+#ifdef PAPI
+#include <papi.h>
+#endif
 
 /* #####   EXPORTED VARIABLES   ########################################### */
 
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
-//#define BARRIER pthread_barrier_wait(&threads_barrier) 
+//#define BARRIER pthread_barrier_wait(&threads_barrier)
 #define BARRIER   barrier_synchronize(&barr)
 
 #ifdef PERFMON
 #define START_PERFMON likwid_markerStartRegion("bench");
 #define STOP_PERFMON  likwid_markerStopRegion("bench");
 #define LIKWID_THREAD_INIT  likwid_markerThreadInit();
+#define EXECUTE EXECUTE_LIKWID
+#else
+#ifdef PAPI
+#define START_PERFMON(event_set) PAPI_start(event_set);
+#define STOP_PERFMON(event_set, result) PAPI_stop ( event_set ,result );
+#define LIKWID_THREAD_INIT
+#define EXECUTE EXECUTE_PAPI
 #else
 #define START_PERFMON
 #define STOP_PERFMON
 #define LIKWID_THREAD_INIT
+#define EXECUTE EXECUTE_LIKWID
+#endif
 #endif
 
-#define EXECUTE(func)   \
+#define EXECUTE_LIKWID(func)   \
     BARRIER; \
     if (data->threadId == 0) \
     { \
@@ -80,9 +92,27 @@
         timer_stop(&time); \
         data->cycles = timer_printCycles(&time); \
     } \
-    BARRIER
-
+    BARRIER 
 
+#define EXECUTE_PAPI(func)   \
+    BARRIER; \
+    if (data->threadId == 0) \
+    { \
+        timer_start(&time); \
+    } \
+    START_PERFMON(event_set)  \
+    for (i=0; i<  data->data.iter; i++) \
+    {   \
+    func; \
+    } \
+    BARRIER; \
+    STOP_PERFMON(event_set, &(result[0]))  \
+    if (data->threadId == 0) \
+    { \
+        timer_stop(&time); \
+        data->cycles = timer_printCycles(&time); \
+    } \
+    BARRIER
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
 void* runTest(void* arg)
@@ -96,11 +126,49 @@ void* runTest(void* arg)
     ThreadUserData* myData;
     TimerData time;
     FuncPrototype func;
+    FILE* OUTSTREAM;
+#ifdef PAPI
+    int event_set = PAPI_NULL;
+    char groupname[50];
+    char* group_ptr = &(groupname[0]);
+    long long int result[4] = {0,0,0,0};
+    group_ptr = getenv("PAPI_BENCH");
+    PAPI_create_eventset(&event_set);
+    PAPI_add_event(event_set, PAPI_TOT_CYC);
+    // L3 group
+    if (strncmp(group_ptr,"L3",2) == 0)
+    {
+        PAPI_add_event(event_set, PAPI_L3_TCA);
+    }
+    // L2 group
+    else if (strncmp(group_ptr,"L2",2) == 0)
+    {
+        PAPI_add_event(event_set, PAPI_L2_TCA);
+    }
+    // FLOPS_AVX
+    else if (strncmp(group_ptr,"FLOPS_AVX",9) == 0)
+    {
+        PAPI_add_event(event_set, PAPI_VEC_SP);
+        PAPI_add_event(event_set, PAPI_VEC_DP);
+        PAPI_add_event(event_set, PAPI_FP_INS);
+    }
+    // FLOPS_DP
+    else if (strncmp(group_ptr,"FLOPS_DP",8) == 0)
+    {
+        PAPI_add_event(event_set, PAPI_DP_OPS);
+    }
+    // FLOPS_SP
+    else if (strncmp(group_ptr,"FLOPS_SP",8) == 0)
+    {
+        PAPI_add_event(event_set, PAPI_SP_OPS);
+    }
+#endif
 
     data = (ThreadData*) arg;
     myData = &(data->data);
     func = myData->test->kernel;
     threadId = data->threadId;
+    OUTSTREAM = data->output;
     barrier_registerThread(&barr, 0, data->globalThreadId);
 
     /* Prepare ptrs for thread */
@@ -111,6 +179,7 @@ void* runTest(void* arg)
 
     switch ( myData->test->type )
     {
+    	case SINGLE_RAND:
         case SINGLE:
             {
                 float* sptr;
@@ -123,6 +192,7 @@ void* runTest(void* arg)
                 }
             }
             break;
+        case DOUBLE_RAND:
         case DOUBLE:
             {
                 double* dptr;
@@ -143,21 +213,24 @@ void* runTest(void* arg)
     sleep(1);
     LIKWID_THREAD_INIT;
     BARRIER;
-    printf("Group: %d Thread %d Global Thread %d running on core %d - Vector length %llu Offset %d\n",
-            data->groupId,
-            threadId,
-            data->globalThreadId,
-            affinity_threadGetProcessorId(),
-            LLU_CAST size,
-            offset);
+    if (OUTSTREAM)
+    {
+        fprintf(OUTSTREAM, "Group: %d Thread %d Global Thread %d running on core %d - Vector length %llu Offset %d\n",
+                data->groupId,
+                threadId,
+                data->globalThreadId,
+                affinity_threadGetProcessorId(),
+                LLU_CAST size,
+                offset);
+    }
     BARRIER;
 
     /* Up to 10 streams the following registers are used for Array ptr:
      * Size rdi
-     * in Registers: rsi  rdx  rcx  r8  r9  
+     * in Registers: rsi  rdx  rcx  r8  r9
      * passed on stack, then: r10  r11  r12  r13  r14  r15
      * If more than 10 streams are used first 5 streams are in register, above 5 a macro must be used to
-     * load them from stack 
+     * load them from stack
      * */
 
     switch ( myData->test->streams ) {
@@ -440,7 +513,24 @@ void* runTest(void* arg)
         default:
             break;
     }
-
+#ifdef PAPI
+    double papi_result = 0.0;
+    // L2 & L3 group
+    if (strncmp(group_ptr,"L3",2) == 0 ||
+        strncmp(group_ptr,"L2",2) == 0)
+    {
+        papi_result = ((double)result[1]) * 64.0;
+    }
+    // FLOPS_AVX
+    else if (strncmp(group_ptr,"FLOPS",5) == 0)
+    {
+        papi_result = (double) result[1]+ (double) result[2];
+    }
+    if (OUTSTREAM)
+    {
+        fprintf(OUTSTREAM, "Thread %d Result %f\n",threadId, papi_result);
+    }
+#endif
     pthread_exit(NULL);
 }
 
diff --git a/src/bitUtil.c b/src/bitUtil.c
index 3df4d62..cdce490 100644
--- a/src/bitUtil.c
+++ b/src/bitUtil.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Utility routines manipulating bit arrays.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/cpuFeatures.c b/src/cpuFeatures.c
index 96b54d2..4733a82 100644
--- a/src/cpuFeatures.c
+++ b/src/cpuFeatures.c
@@ -9,8 +9,8 @@
  *                  Allows to turn on and off the Hardware prefetcher
  *                  available.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -60,11 +60,11 @@ CpuFeatureFlags cpuFeatureFlags;
 #define TEST_FLAG(feature,flag)  \
     if (flags & (1ULL<<(flag)))   \
     {                    \
-		cpuFeatureFlags.feature = 1; \
+        cpuFeatureFlags.feature = 1; \
     }                    \
     else                \
     {                \
-		cpuFeatureFlags.feature = 0; \
+        cpuFeatureFlags.feature = 0; \
     }
 
 
@@ -75,15 +75,15 @@ cpuFeatures_init(int cpu)
 {
     uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
 
-	TEST_FLAG(fastStrings,0);
-	TEST_FLAG(thermalControl,3);
-	TEST_FLAG(perfMonitoring,7);
-	TEST_FLAG(branchTraceStorage,11);
-	TEST_FLAG(pebs,12);
-	TEST_FLAG(speedstep,16);
-	TEST_FLAG(monitor,18);
-	TEST_FLAG(cpuidMaxVal,22);
-	TEST_FLAG(xdBit,34);
+    TEST_FLAG(fastStrings,0);
+    TEST_FLAG(thermalControl,3);
+    TEST_FLAG(perfMonitoring,7);
+    TEST_FLAG(branchTraceStorage,11);
+    TEST_FLAG(pebs,12);
+    TEST_FLAG(speedstep,16);
+    TEST_FLAG(monitor,18);
+    TEST_FLAG(cpuidMaxVal,22);
+    TEST_FLAG(xdBit,34);
 
     if ((cpuid_info.model == NEHALEM) ||
             (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
@@ -154,7 +154,7 @@ cpuFeatures_print(int cpu)
     }
     printf("Branch Trace Storage: \t\t");
 
-    if (flags & (1ULL<<11)) 
+    if (flags & (1ULL<<11))
     {
         PRINT_VALUE(RED,notsupported);
     }
@@ -164,7 +164,7 @@ cpuFeatures_print(int cpu)
     }
 
     printf("PEBS: \t\t\t\t");
-    if (flags & (1ULL<<12)) 
+    if (flags & (1ULL<<12))
     {
         PRINT_VALUE(RED,notsupported);
     }
@@ -174,7 +174,7 @@ cpuFeatures_print(int cpu)
     }
 
     printf("Intel Enhanced SpeedStep: \t");
-    if (flags & (1ULL<<16)) 
+    if (flags & (1ULL<<16))
     {
         PRINT_VALUE(GREEN,enabled);
     }
@@ -184,7 +184,7 @@ cpuFeatures_print(int cpu)
     }
 
     printf("MONITOR/MWAIT: \t\t\t");
-    if (flags & (1ULL<<18)) 
+    if (flags & (1ULL<<18))
     {
         PRINT_VALUE(GREEN,supported);
     }
@@ -194,7 +194,7 @@ cpuFeatures_print(int cpu)
     }
 
     printf("Limit CPUID Maxval: \t\t");
-    if (flags & (1ULL<<22)) 
+    if (flags & (1ULL<<22))
     {
         PRINT_VALUE(RED,enabled);
     }
@@ -204,7 +204,7 @@ cpuFeatures_print(int cpu)
     }
 
     printf("XD Bit Disable: \t\t");
-    if (flags & (1ULL<<34)) 
+    if (flags & (1ULL<<34))
     {
         PRINT_VALUE(RED,disabled);
     }
@@ -212,45 +212,53 @@ cpuFeatures_print(int cpu)
     {
         PRINT_VALUE(GREEN,enabled);
     }
-
-    printf("IP Prefetcher: \t\t\t");
-    if (flags & (1ULL<<39)) 
-    {
-        PRINT_VALUE(RED,disabled);
-    }
-    else
-    {
-        PRINT_VALUE(GREEN,enabled);
-    }
-
-    printf("Hardware Prefetcher: \t\t");
-    if (flags & (1ULL<<9)) 
-    {
-        PRINT_VALUE(RED,disabled);
-    }
-    else
+    if ((cpuid_info.model == NEHALEM) ||
+            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+            (cpuid_info.model == NEHALEM_WESTMERE) ||
+            (cpuid_info.model == NEHALEM_WESTMERE_M) ||
+            (cpuid_info.model == NEHALEM_EX) ||
+            (cpuid_info.model == CORE2_45) ||
+            (cpuid_info.model == CORE2_65))
     {
-        PRINT_VALUE(GREEN,enabled);
-    }
+        printf("IP Prefetcher: \t\t\t");
+        if (flags & (1ULL<<39))
+        {
+            PRINT_VALUE(RED,disabled);
+        }
+        else
+        {
+            PRINT_VALUE(GREEN,enabled);
+        }
 
-    printf("Adjacent Cache Line Prefetch: \t");
-    if (flags & (1ULL<<19)) 
-    {
-        PRINT_VALUE(RED,disabled);
-    }
-    else
-    {
-        PRINT_VALUE(GREEN,enabled);
-    }
+        printf("Hardware Prefetcher: \t\t");
+        if (flags & (1ULL<<9))
+        {
+            PRINT_VALUE(RED,disabled);
+        }
+        else
+        {
+            PRINT_VALUE(GREEN,enabled);
+        }
+        printf("Adjacent Cache Line Prefetch: \t");
+        if (flags & (1ULL<<19))
+        {
+            PRINT_VALUE(RED,disabled);
+        }
+        else
+        {
+            PRINT_VALUE(GREEN,enabled);
+        }
 
-    printf("DCU Prefetcher: \t\t");
-    if (flags & (1ULL<<37)) 
-    {
-        PRINT_VALUE(RED,disabled);
-    }
-    else
-    {
-        PRINT_VALUE(GREEN,enabled);
+        printf("DCU Prefetcher: \t\t");
+        if (flags & (1ULL<<37))
+        {
+            PRINT_VALUE(RED,disabled);
+        }
+        else
+        {
+            PRINT_VALUE(GREEN,enabled);
+        }
     }
 
     if ((cpuid_info.model == NEHALEM) ||
@@ -260,12 +268,12 @@ cpuFeatures_print(int cpu)
             (cpuid_info.model == NEHALEM_WESTMERE_M) ||
             (cpuid_info.model == NEHALEM_EX))
     {
-        printf("Intel Turbo Mode: \t");
-        if (flags & (1ULL<<38)) 
+        printf("Intel Turbo Mode: \t\t");
+        if (flags & (1ULL<<38))
         {
             PRINT_VALUE(RED,disabled);
         }
-        else 
+        else
         {
             PRINT_VALUE(GREEN,enabled);
         }
@@ -275,11 +283,11 @@ cpuFeatures_print(int cpu)
     {
 
         printf("Intel Dynamic Acceleration: \t");
-        if (flags & (1ULL<<38)) 
+        if (flags & (1ULL<<38))
         {
             PRINT_VALUE(RED,disabled);
         }
-        else 
+        else
         {
             PRINT_VALUE(GREEN,enabled);
         }
@@ -288,78 +296,104 @@ cpuFeatures_print(int cpu)
     printf(HLINE);
 }
 
-void 
+void
 cpuFeatures_enable(int cpu, CpuFeature type)
 {
-    uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
-
-    switch ( type )
+    if ((cpuid_info.model == NEHALEM) ||
+            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+            (cpuid_info.model == NEHALEM_WESTMERE) ||
+            (cpuid_info.model == NEHALEM_WESTMERE_M) ||
+            (cpuid_info.model == NEHALEM_EX) ||
+            (cpuid_info.model == CORE2_45) ||
+            (cpuid_info.model == CORE2_65))
     {
-        case HW_PREFETCHER:
-            printf("HW_PREFETCHER:\t");
-            flags &= ~(1ULL<<9);
-            break;
-
-        case CL_PREFETCHER:
-            printf("CL_PREFETCHER:\t");
-            flags &= ~(1ULL<<19);
-            break;
-
-        case DCU_PREFETCHER:
-            printf("DCU_PREFETCHER:\t");
-            flags &= ~(1ULL<<37);
-            break;
-
-        case IP_PREFETCHER:
-            printf("IP_PREFETCHER:\t");
-            flags &= ~(1ULL<<39);
-            break;
-
-        default:
-            printf("ERROR: CpuFeature not supported!\n");
-            break;
+        uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
+        switch ( type )
+        {
+            case HW_PREFETCHER:
+                printf("HW_PREFETCHER:\t");
+                flags &= ~(1ULL<<9);
+                break;
+
+            case CL_PREFETCHER:
+                printf("CL_PREFETCHER:\t");
+                flags &= ~(1ULL<<19);
+                break;
+
+            case DCU_PREFETCHER:
+                printf("DCU_PREFETCHER:\t");
+                flags &= ~(1ULL<<37);
+                break;
+
+            case IP_PREFETCHER:
+                printf("IP_PREFETCHER:\t");
+                flags &= ~(1ULL<<39);
+                break;
+
+            default:
+                printf("ERROR: CpuFeature not supported!\n");
+                break;
+        }
+        PRINT_VALUE(GREEN,enabled);
+        printf("\n");
+        msr_write(cpu, MSR_IA32_MISC_ENABLE, flags);
+    }
+    else
+    {
+        printf("ERROR: Architecture does not support the manipulation of prefetchers\n");
     }
-    PRINT_VALUE(GREEN,enabled);
-    printf("\n");
-
-    msr_write(cpu, MSR_IA32_MISC_ENABLE, flags);
 }
 
 
 void
 cpuFeatures_disable(int cpu, CpuFeature type)
 {
-    uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
+    if ((cpuid_info.model == NEHALEM) ||
+            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+            (cpuid_info.model == NEHALEM_WESTMERE) ||
+            (cpuid_info.model == NEHALEM_WESTMERE_M) ||
+            (cpuid_info.model == NEHALEM_EX) ||
+            (cpuid_info.model == CORE2_45) ||
+            (cpuid_info.model == CORE2_65))
+    {
+        uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
 
-    switch ( type ) 
+        switch ( type )
+        {
+            case HW_PREFETCHER:
+                printf("HW_PREFETCHER:\t");
+                flags |= (1ULL<<9);
+                break;
+
+            case CL_PREFETCHER:
+                printf("CL_PREFETCHER:\t");
+                flags |= (1ULL<<19);
+                break;
+
+            case DCU_PREFETCHER:
+                printf("DCU_PREFETCHER:\t");
+                flags |= (1ULL<<37);
+                break;
+
+            case IP_PREFETCHER:
+                printf("IP_PREFETCHER:\t");
+                flags |= (1ULL<<39);
+                break;
+
+            default:
+                printf("ERROR: CpuFeature not supported!\n");
+                break;
+        }
+        PRINT_VALUE(RED,disabled);
+        printf("\n");
+
+        msr_write(cpu, MSR_IA32_MISC_ENABLE, flags);
+    }
+    else
     {
-        case HW_PREFETCHER:
-            printf("HW_PREFETCHER:\t");
-            flags |= (1ULL<<9);
-            break;
-
-        case CL_PREFETCHER:
-            printf("CL_PREFETCHER:\t");
-            flags |= (1ULL<<19);
-            break;
-
-        case DCU_PREFETCHER:
-            printf("DCU_PREFETCHER:\t");
-            flags |= (1ULL<<37);
-            break;
-
-        case IP_PREFETCHER:
-            printf("IP_PREFETCHER:\t");
-            flags |= (1ULL<<39);
-            break;
-
-        default:
-            printf("ERROR: CpuFeature not supported!\n");
-            break;
+        printf("ERROR: Architecture does not support the manipulation of prefetchers\n");
     }
-    PRINT_VALUE(RED,disabled);
-    printf("\n");
-
-    msr_write(cpu, MSR_IA32_MISC_ENABLE, flags);
 }
 
diff --git a/src/cpuid.c b/src/cpuid.c
index fec599b..6a9ac47 100644
--- a/src/cpuid.c
+++ b/src/cpuid.c
@@ -6,8 +6,8 @@
  *      Description:  Implementation of cpuid module.
  *                  Provides API to extract cpuid info on x86 processors.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -39,6 +39,7 @@
 #include <unistd.h>
 #include <sched.h>
 #include <time.h>
+#include <math.h>
 
 #include <error.h>
 #include <cpuid.h>
@@ -78,6 +79,8 @@ static char* core_2b_str = "Intel Core 2 45nm processor";
 static char* atom_45_str = "Intel Atom 45nm processor";
 static char* atom_32_str = "Intel Atom 32nm processor";
 static char* atom_22_str = "Intel Atom 22nm processor";
+static char* atom_silvermont_str = "Intel Atom (Silvermont) 22nm processor";
+static char* atom_saltwell_str = "Intel Atom (Saltwell) 32nm processor";
 static char* nehalem_bloom_str = "Intel Core Bloomfield processor";
 static char* nehalem_lynn_str = "Intel Core Lynnfield processor";
 static char* nehalem_west_str = "Intel Core Westmere processor";
@@ -86,6 +89,7 @@ static char* ivybridge_str = "Intel Core IvyBridge processor";
 static char* ivybridge_ep_str = "Intel Core IvyBridge EP processor";
 static char* sandybridge_ep_str = "Intel Core SandyBridge EP processor";
 static char* haswell_str = "Intel Core Haswell processor";
+static char* haswell_ex_str = "Intel Core Haswell EX processor";
 static char* nehalem_ex_str = "Intel Nehalem EX processor";
 static char* westmere_ex_str = "Intel Westmere EX processor";
 static char* xeon_mp_string = "Intel Xeon MP processor";
@@ -224,6 +228,9 @@ static int intelCpuidFunc_4(CacheLevel** cachePool)
     int maxNumLevels=0;
     uint32_t valid=1;
     CacheLevel* pool;
+    int threadsPerCpu = 0;
+    int numThreadsPerSocket = cpuid_topology.numCoresPerSocket *
+                              cpuid_topology.numThreadsPerCore;
 
     while (valid)
     {
@@ -257,6 +264,7 @@ static int intelCpuidFunc_4(CacheLevel** cachePool)
             pool[i].associativity *
             pool[i].lineSize;
         pool[i].threads = extractBitField(eax,10,14)+1;
+        pool[i].inclusive = edx&0x2;
 
         /* WORKAROUND cpuid reports wrong number of threads on SMT processor with SMT
          * turned off */
@@ -285,21 +293,60 @@ static int intelCpuidFunc_4(CacheLevel** cachePool)
         }
 
         /* :WORKAROUND:08/13/2009 08:34:15 AM:jt: For L3 caches the value is sometimes 
-         * too large in here. Ask Intel what is wrong here!
-         * Limit threads per Socket then to the maximum possible value.*/
-        if(pool[i].threads > (int)
-                (cpuid_topology.numCoresPerSocket*
-                 cpuid_topology.numThreadsPerCore))
+         * too large in here. 
+         * See Documentation: Threads contains maximum number of threads supported
+         * by the cache.
+         * Limit threads per Socket then to the maximum possible value. If the number
+         * of threads supported by the cache does not divide the threads on the socket
+         * without remainder, the threads are adjusted to fit the multiple caches.
+         */
+        if(pool[i].threads > numThreadsPerSocket)
         {
-            pool[i].threads = cpuid_topology.numCoresPerSocket*
-                cpuid_topology.numThreadsPerCore;
+            pool[i].threads = numThreadsPerSocket;
+        }
+        else if (((double)numThreadsPerSocket)/((double)pool[i].threads) != 
+                  (double)(numThreadsPerSocket/pool[i].threads))
+        {
+            pool[i].threads = numThreadsPerSocket/
+                (int)ceil(((double)numThreadsPerSocket)/((double)pool[i].threads));
+        }
+        /* For Intel Silvermont this is not enough. It returns 4 threads and 8 cores
+         * for the L2 cache. But according to the data sheet, each 1MB L2 cache slice 
+         * is shared by 2 threads/cores.
+         */
+        else if (pool[i].level == 2 && 
+                ((cpuid_info.model == ATOM_SILVERMONT_C) ||
+                 (cpuid_info.model == ATOM_SILVERMONT_E) ||
+                 (cpuid_info.model == ATOM_SILVERMONT_F1) ||
+                 (cpuid_info.model == ATOM_SILVERMONT_F2) ||
+                 (cpuid_info.model == ATOM_SILVERMONT_F3)))
+        {
+            pool[i].threads = 2;
         }
-        pool[i].inclusive = edx&0x2;
     }
 
+    
+
     return maxNumLevels;
 }
 
+static int recheck_numHWThreads()
+{
+    int cpucount = 0;
+    char line[1024];
+    FILE* fp = fopen("/proc/cpuinfo","r");
+    if (fp != NULL)
+    {
+        while( fgets(line,1024,fp) )
+        {
+            if (strncmp(line, "processor", 9) == 0)
+            {
+                cpucount++;
+            }
+        }
+    }
+    return cpucount;
+}
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
@@ -383,15 +430,16 @@ int cpuid_init (void)
 
                 case HASWELL:
 
-                case HASWELL_EX:
-
                 case HASWELL_M1:
 
                 case HASWELL_M2:
-
                     cpuid_info.name = haswell_str;
                     break;
 
+                case HASWELL_EX:
+                    cpuid_info.name = haswell_ex_str;
+                    break;
+
                 case NEHALEM_EX:
                     cpuid_info.name = nehalem_ex_str;
                     break;
@@ -418,6 +466,14 @@ int cpuid_init (void)
                     cpuid_info.name = atom_22_str;
                     break;
 
+                case ATOM_SILVERMONT_C:
+                case ATOM_SILVERMONT_E:
+                case ATOM_SILVERMONT_F1:
+                case ATOM_SILVERMONT_F2:
+                case ATOM_SILVERMONT_F3:
+                    cpuid_info.name = atom_silvermont_str;
+                    break;
+
                 default:
                     cpuid_info.name = unknown_intel_str;
                     break;
@@ -585,20 +641,19 @@ int cpuid_init (void)
         strcat(cpuid_info.features, "RDRAND ");
         cpuid_info.featureFlags |= (1<<RDRAND);
     }
-
     if (edx & (1<<22))
     {
-        strcpy(cpuid_info.features, "ACPI ");
+        strcat(cpuid_info.features, "ACPI ");
         cpuid_info.featureFlags |= (1<<ACPI);
     }
     if (edx & (1<<23))
     {
-        strcpy(cpuid_info.features, "MMX ");
+        strcat(cpuid_info.features, "MMX ");
         cpuid_info.featureFlags |= (1<<MMX);
     }
     if (edx & (1<<25))
     {
-        strcpy(cpuid_info.features, "SSE ");
+        strcat(cpuid_info.features, "SSE ");
         cpuid_info.featureFlags |= (1<<SSE);
     }
     if (edx & (1<<26))
@@ -647,13 +702,17 @@ int cpuid_init (void)
 
     if ((file = fopen(filepath, "rb")) != NULL) 
     {
-        printf("Read config from file\n");
+        //printf("Read config from file\n");
         initTopology(file);
         fclose(file);
     }
     else
     {
         cpuid_topology.numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
+        if (recheck_numHWThreads() != cpuid_topology.numHWThreads)
+        {
+            cpuid_topology.numHWThreads = recheck_numHWThreads();
+        }
         cpu_set_t cpuSet;
         CPU_ZERO(&cpuSet);
         sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
@@ -679,13 +738,16 @@ void cpuid_print (void)
     printf("\t%s\n",nehalem_bloom_str);
     printf("\t%s\n",nehalem_lynn_str);
     printf("\t%s\n",nehalem_west_str);
-    printf("\t%s\n",nehalem_ex_str);
-    printf("\t%s\n",westmere_ex_str);
+    printf("\t%s (with Uncore support)\n",nehalem_ex_str);
+    printf("\t%s (with Uncore support)\n",westmere_ex_str);
     printf("\t%s\n",sandybridge_str);
-    printf("\t%s\n",sandybridge_ep_str);
+    printf("\t%s (with Uncore support)\n",sandybridge_ep_str);
     printf("\t%s\n",ivybridge_str);
-    printf("\t%s\n",ivybridge_ep_str);
-    printf("\t%s\n",haswell_str);
+    printf("\t%s (with Uncore support)\n",ivybridge_ep_str);
+    printf("\t%s (with Uncore support)\n",haswell_str);
+    printf("\t%s (no Uncore support)\n",haswell_ex_str);
+    printf("\t%s\n",atom_silvermont_str);
+    printf("\t%s\n",atom_saltwell_str);
     printf("\t%s\n\n",xeon_phi_string);
 
     printf("Supported AMD processors:\n");
diff --git a/src/daemon.c b/src/daemon.c
index 8cf4150..de5bfa5 100644
--- a/src/daemon.c
+++ b/src/daemon.c
@@ -5,8 +5,8 @@
  *
  *      Description:  C Module implementing a daemon time loop
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -41,64 +41,82 @@
 #include <perfmon.h>
 #include <daemon.h>
 
-static int daemon_run = 0;
+static volatile int daemon_run = 0;
 static bstring eventString;
 static TimerData timeData;
+static pid_t daemonpid = 0;
 
 
 void
-daemon_init(bstring str)
+daemon_start(bstring str, struct timespec interval)
 {
-    eventString = bstrcpy(str);
-    signal(SIGINT, daemon_stop);
-    signal(SIGUSR1, daemon_interrupt);
-
-}
-
-void
-daemon_start(struct timespec interval)
-{
-    daemon_run = 1;
-    perfmon_startCounters();
-    timer_start(&timeData);
-
-    while (1)
+    daemonpid = fork();
+    if (daemonpid == 0)
     {
-        if (daemon_run)
+        eventString = bstrcpy(str);
+        signal(SIGINT, daemon_interrupt);
+        signal(SIGUSR1, daemon_interrupt);
+        daemon_run = 1;
+        perfmon_setupEventSet(eventString, NULL);
+        perfmon_startCounters();
+        timer_start(&timeData);
+
+        while (1)
         {
-            timer_stop(&timeData);
-            perfmon_readCounters();
-            perfmon_logCounterResults( timer_print(&timeData) );
-            timer_start(&timeData);
+            if (daemon_run)
+            {
+                timer_stop(&timeData);
+                perfmon_readCounters();
+                perfmon_logCounterResults( timer_print(&timeData) );
+                timer_start(&timeData);
+            }
+            else
+            {
+                break;
+            }
+            nanosleep( &interval, NULL);
         }
-        nanosleep( &interval, NULL);
+        signal(SIGINT, SIG_DFL);
+        signal(SIGUSR1, SIG_DFL);
+        exit(EXIT_SUCCESS);
     }
 }
 
 void
 daemon_stop(int sig)
 {
-    printf("DAEMON:  EXIT on %d\n", sig);
-    perfmon_stopCounters();
-    signal(SIGINT, SIG_DFL);
-    kill(getpid(), SIGINT);
+    if (daemonpid > 0)
+    {
+        printf("PARENT: KILL daemon with signal %d\n", sig);
+        kill(daemonpid, sig);
+        //perfmon_stopCounters();
+    }
 }
 
 void
 daemon_interrupt(int sig)
 {
-    if (daemon_run)
+    if (sig == SIGUSR1)
     {
-        perfmon_stopCounters();
-        daemon_run = 0;
-        printf("DAEMON:  STOP on %d\n",sig);
-    }
-    else
+        if (daemon_run)
+        {
+            perfmon_stopCounters();
+            daemon_run = 0;
+            printf("DAEMON: STOP on %d\n",sig);
+            exit(EXIT_SUCCESS);
+        }
+        else
+        {
+            perfmon_setupEventSet(eventString, NULL);
+            perfmon_startCounters();
+            daemon_run = 1;
+            printf("DAEMON: START with events %s\n",bdata(eventString));
+        }
+    } else
     {
-        perfmon_setupEventSet(eventString, NULL);
-        perfmon_startCounters();
-        daemon_run = 1;
-        printf("DAEMON:  START\n");
+        printf("DAEMON: EXIT on %d\n", sig);
+        daemon_run = 0;
+        exit(EXIT_SUCCESS);
     }
 }
 
diff --git a/src/ghash.c b/src/ghash.c
index 91a61bc..87e0ed0 100644
--- a/src/ghash.c
+++ b/src/ghash.c
@@ -1,6 +1,5 @@
 /*
  * =======================================================================================
- *      Copyright (C) 2014 Jan Treibig
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/hashTable.c b/src/hashTable.c
index b5711cb..bf6c3d8 100644
--- a/src/hashTable.c
+++ b/src/hashTable.c
@@ -6,8 +6,8 @@
  *      Description: Hashtable implementation based on SGLIB.
  *                   Used for Marker API result handling.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -86,7 +86,11 @@ int hashTable_get(bstring label, LikwidThreadResults** resEntry)
         (*resEntry)->label = bstrcpy (label);
         (*resEntry)->time = 0.0;
         (*resEntry)->count = 0;
-        for (int i=0; i< NUM_PMC; i++) (*resEntry)->PMcounters[i] = 0.0;
+        for (int i=0; i< NUM_PMC; i++) 
+        {
+            (*resEntry)->PMcounters[i] = 0.0;
+            (*resEntry)->StartPMcounters[i] = 0.0;
+        }
 
         g_hash_table_insert(
                 resPtr->hashTable,
diff --git a/src/includes/accessClient.h b/src/includes/accessClient.h
index 1c4fefe..0058182 100644
--- a/src/includes/accessClient.h
+++ b/src/includes/accessClient.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File accessClient Module. 
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/accessClient_types.h b/src/includes/accessClient_types.h
index 98610d7..a0c7a84 100644
--- a/src/includes/accessClient_types.h
+++ b/src/includes/accessClient_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for accessClient module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/affinity.h b/src/includes/affinity.h
index 93814e7..f347e64 100644
--- a/src/includes/affinity.h
+++ b/src/includes/affinity.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File affinity Module
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -42,7 +42,7 @@ extern int  affinity_threadGetProcessorId();
 extern void  affinity_pinProcess(int processorId);
 extern void  affinity_pinThread(int processorId);
 extern const AffinityDomain* affinity_getDomain(bstring domain);
-extern void affinity_printDomains();
+extern void affinity_printDomains(FILE* OUTSTREAM);
 
 #endif /*AFFINITY_H*/
 
diff --git a/src/includes/affinity_types.h b/src/includes/affinity_types.h
index 3527b75..2b08bfe 100644
--- a/src/includes/affinity_types.h
+++ b/src/includes/affinity_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Type Definitions for affinity Module
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -35,7 +35,7 @@ typedef struct {
     bstring tag;
     uint32_t numberOfProcessors;
     uint32_t numberOfCores;
-    int*  processorList;
+    int* processorList;
 } AffinityDomain;
 
 
diff --git a/src/includes/allocator.h b/src/includes/allocator.h
index 47acb8d..a21555c 100644
--- a/src/includes/allocator.h
+++ b/src/includes/allocator.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File allocator Module. 
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  none
@@ -36,12 +36,13 @@
 
 extern void allocator_init(int numVectors);
 extern void allocator_finalize();
-extern void allocator_allocateVector(void** ptr,
-        int alignment,
-        uint64_t size,
-        int offset,
-        DataType type,
-        bstring domain);
+extern void allocator_allocateVector(FILE* OUTSTREAM,
+                                     void** ptr,
+                                     int alignment,
+                                     uint64_t size,
+                                     int offset,
+                                     DataType type,
+                                     bstring domain);
 
 #endif /*ALLOCATOR_H*/
 
diff --git a/src/includes/asciiBoxes.h b/src/includes/asciiBoxes.h
index 3e82632..dd37a05 100644
--- a/src/includes/asciiBoxes.h
+++ b/src/includes/asciiBoxes.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Module to draw nested ascii art boxes.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -37,6 +37,6 @@
 extern BoxContainer* asciiBoxes_allocateContainer(int numLines,int numColumns);
 extern void asciiBoxes_addBox(BoxContainer* container, int line, int column, bstring label);
 extern void asciiBoxes_addJoinedBox(BoxContainer* container, int line, int startColumn, int endColumn, bstring label);
-extern void asciiBoxes_print(BoxContainer* container);
+extern void asciiBoxes_print(FILE* OUTSTREAM, BoxContainer* container);
 
 #endif /*ASCIIBOXES_H*/
diff --git a/src/includes/asciiBoxes_types.h b/src/includes/asciiBoxes_types.h
index 42347a1..f09c4b3 100644
--- a/src/includes/asciiBoxes_types.h
+++ b/src/includes/asciiBoxes_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for asciiBoxes module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/asciiTable.h b/src/includes/asciiTable.h
index 399a3fe..6096c4a 100644
--- a/src/includes/asciiTable.h
+++ b/src/includes/asciiTable.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Module to create and print a ascii table
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/asciiTable_types.h b/src/includes/asciiTable_types.h
index 639fd08..986a8a2 100644
--- a/src/includes/asciiTable_types.h
+++ b/src/includes/asciiTable_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for asciiTable module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/barrier.h b/src/includes/barrier.h
index cf07624..5f4142d 100644
--- a/src/includes/barrier.h
+++ b/src/includes/barrier.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File barrier Module
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/barrier_types.h b/src/includes/barrier_types.h
index 87327ff..d0abb55 100644
--- a/src/includes/barrier_types.h
+++ b/src/includes/barrier_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Type Definitions for barrier Module
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/bitUtil.h b/src/includes/bitUtil.h
index 8e87e69..c876eea 100644
--- a/src/includes/bitUtil.h
+++ b/src/includes/bitUtil.h
@@ -6,8 +6,8 @@
  *      Description:  Header File bitUtil Module. 
  *                    Helper routines for dealing with bit manipulations
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/cpuFeatures.h b/src/includes/cpuFeatures.h
index bca9211..9274e40 100644
--- a/src/includes/cpuFeatures.h
+++ b/src/includes/cpuFeatures.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of Module cpuFeatures.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/cpuFeatures_types.h b/src/includes/cpuFeatures_types.h
index a6f30d9..3e7ec5d 100644
--- a/src/includes/cpuFeatures_types.h
+++ b/src/includes/cpuFeatures_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for CpuFeature module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -38,23 +38,23 @@ typedef enum {
     IP_PREFETCHER} CpuFeature;
 
 typedef struct {
-	unsigned int fastStrings:1;
-	unsigned int thermalControl:1;
-	unsigned int perfMonitoring:1;
-	unsigned int hardwarePrefetcher:1;
-	unsigned int ferrMultiplex:1;
-	unsigned int branchTraceStorage:1;
-	unsigned int pebs:1;
-	unsigned int speedstep:1;
-	unsigned int monitor:1;
-	unsigned int clPrefetcher:1;
-	unsigned int speedstepLock:1;
-	unsigned int cpuidMaxVal:1;
-	unsigned int xdBit:1;
-	unsigned int dcuPrefetcher:1;
-	unsigned int dynamicAcceleration:1;
-	unsigned int turboMode:1;
-	unsigned int ipPrefetcher:1;
+    unsigned int fastStrings:1;
+    unsigned int thermalControl:1;
+    unsigned int perfMonitoring:1;
+    unsigned int hardwarePrefetcher:1;
+    unsigned int ferrMultiplex:1;
+    unsigned int branchTraceStorage:1;
+    unsigned int pebs:1;
+    unsigned int speedstep:1;
+    unsigned int monitor:1;
+    unsigned int clPrefetcher:1;
+    unsigned int speedstepLock:1;
+    unsigned int cpuidMaxVal:1;
+    unsigned int xdBit:1;
+    unsigned int dcuPrefetcher:1;
+    unsigned int dynamicAcceleration:1;
+    unsigned int turboMode:1;
+    unsigned int ipPrefetcher:1;
     } CpuFeatureFlags;
 
 
diff --git a/src/includes/cpuid.h b/src/includes/cpuid.h
index c8db288..80c426a 100644
--- a/src/includes/cpuid.h
+++ b/src/includes/cpuid.h
@@ -7,8 +7,8 @@
  *                    Reads out cpuid information and initilaizes a global 
  *                    data structure cpuid_info.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -45,6 +45,11 @@
 #define ATOM_45              0x26U
 #define ATOM_32              0x36U
 #define ATOM_22              0x27U
+#define ATOM_SILVERMONT_E    0x37U
+#define ATOM_SILVERMONT_C    0x4DU
+#define ATOM_SILVERMONT_F1   0x4AU
+#define ATOM_SILVERMONT_F2   0x5AU
+#define ATOM_SILVERMONT_F3   0x5DU
 #define NEHALEM              0x1AU
 #define NEHALEM_BLOOMFIELD   0x1AU
 #define NEHALEM_LYNNFIELD    0x1EU
@@ -118,7 +123,7 @@ extern int  cpuid_isInCpuset(void);
 
 static inline int cpuid_hasFeature(FeatureBit bit)
 {
-  return (cpuid_info.featureFlags & (1<<bit));
+    return (cpuid_info.featureFlags & (1<<bit));
 }
 
 
diff --git a/src/includes/cpuid_types.h b/src/includes/cpuid_types.h
index 4c59ccd..cccc22d 100644
--- a/src/includes/cpuid_types.h
+++ b/src/includes/cpuid_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for cpuid module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -79,6 +79,7 @@ typedef struct {
     uint32_t perf_num_ctr;
     uint32_t perf_width_ctr;
     uint32_t perf_num_fixed_ctr;
+    int supportUncore;
 } CpuInfo;
 
 typedef struct {
diff --git a/src/includes/daemon.h b/src/includes/daemon.h
index 753507f..3272636 100644
--- a/src/includes/daemon.h
+++ b/src/includes/daemon.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File daemon Module. 
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -35,7 +35,7 @@
 #include <time.h>
 
 extern void daemon_init();
-extern void daemon_start(struct timespec interval);
+extern void daemon_start(bstring str, struct timespec interval);
 extern void daemon_stop(int sig);
 extern void daemon_interrupt(int sig);
 
diff --git a/src/includes/error.h b/src/includes/error.h
index c726a7c..3c1526f 100644
--- a/src/includes/error.h
+++ b/src/includes/error.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Central error handling macros
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/ghash.h b/src/includes/ghash.h
index f0c233a..f33e9fb 100644
--- a/src/includes/ghash.h
+++ b/src/includes/ghash.h
@@ -59,13 +59,13 @@ typedef struct _GHashTableIter GHashTableIter;
 
 struct _GHashTableIter
 {
-  /*< private >*/
-  gpointer      dummy1;
-  gpointer      dummy2;
-  gpointer      dummy3;
-  int           dummy4;
-  gboolean      dummy5;
-  gpointer      dummy6;
+    /*< private >*/
+    gpointer      dummy1;
+    gpointer      dummy2;
+    gpointer      dummy3;
+    int           dummy4;
+    gboolean      dummy5;
+    gpointer      dummy6;
 };
 
 char* g_strdup (const char *str);
diff --git a/src/includes/hashTable.h b/src/includes/hashTable.h
index 713a3bd..078fff9 100644
--- a/src/includes/hashTable.h
+++ b/src/includes/hashTable.h
@@ -7,8 +7,8 @@
  *                    Wrapper for HAshTable data structure holding thread
  *                    specific region information.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/libperfctr_types.h b/src/includes/libperfctr_types.h
index f757d0f..99a38dc 100644
--- a/src/includes/libperfctr_types.h
+++ b/src/includes/libperfctr_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for libperfctr module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/likwid.h b/src/includes/likwid.h
index 287582e..dd4cdfd 100644
--- a/src/includes/likwid.h
+++ b/src/includes/likwid.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of likwid marker API
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/lock.h b/src/includes/lock.h
index 3684f4c..87d1593 100644
--- a/src/includes/lock.h
+++ b/src/includes/lock.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File Locking primitive Module
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -60,40 +60,40 @@ static int lock_check(void)
 
     if ((lock_handle = open(filepath, O_RDONLY )) == -1 )
     {
-	if (errno == ENOENT)
-	{
-	    /* There is no lock file. Proceed. */
-	    result = 1;
-	}
-	else if (errno == EACCES)
-	{
-	    /* There is a lock file. We cannot open it. */
-	    result = 0;
-	}
-	else 
-	{
-	    /* Another error occured. Proceed. */
-	    result = 1;
-	}
+    if (errno == ENOENT)
+    {
+        /* There is no lock file. Proceed. */
+        result = 1;
+    }
+    else if (errno == EACCES)
+    {
+        /* There is a lock file. We cannot open it. */
+        result = 0;
+    }
+    else 
+    {
+        /* Another error occurred. Proceed. */
+        result = 1;
+    }
     }
     else
     {
-	/* There is a lock file and we can open it. Check if we own it. */
-	stat(filepath, &buf);
+    /* There is a lock file and we can open it. Check if we own it. */
+    stat(filepath, &buf);
 
-	if ( buf.st_uid == getuid() )  /* Succeed, we own the lock */
-	{
-	    result = 1;
-	}
-	else  /* we are not the owner */
-	{
-	    result = 0;
-	}
+    if ( buf.st_uid == getuid() )  /* Succeed, we own the lock */
+    {
+        result = 1;
+    }
+    else  /* we are not the owner */
+    {
+        result = 0;
+    }
     }
 
     if (lock_handle)
     {
-	close(lock_handle);
+    close(lock_handle);
     }
 
     return result;
diff --git a/src/includes/memsweep.h b/src/includes/memsweep.h
index 3dfa486..e29d4d8 100644
--- a/src/includes/memsweep.h
+++ b/src/includes/memsweep.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File memsweep Module. 
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -34,9 +34,9 @@
 #include <types.h>
 
 extern void memsweep_setMemoryFraction(uint64_t fraction);
-extern void memsweep_node(void);
-extern void memsweep_domain(int domainId);
-extern void memsweep_threadGroup(int* processorList, int numberOfProcessors);
+extern void memsweep_node(FILE* OUTSTREAM);
+extern void memsweep_domain(FILE* OUTSTREAM, int domainId);
+extern void memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors);
 
 #endif /* MEMSWEEP_H */
 
diff --git a/src/includes/msr.h b/src/includes/msr.h
index a9ab911..45f8069 100644
--- a/src/includes/msr.h
+++ b/src/includes/msr.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File msr Module. 
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/multiplex.h b/src/includes/multiplex.h
index d40d0eb..c34cac8 100644
--- a/src/includes/multiplex.h
+++ b/src/includes/multiplex.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File multiplex Module. 
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/multiplex_types.h b/src/includes/multiplex_types.h
index 492d61c..8578a8f 100644
--- a/src/includes/multiplex_types.h
+++ b/src/includes/multiplex_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for multiplex  module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/numa.h b/src/includes/numa.h
index ab253fc..3a2d0f1 100644
--- a/src/includes/numa.h
+++ b/src/includes/numa.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File numa Module. 
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/numa_types.h b/src/includes/numa_types.h
index c065d50..bd4afda 100644
--- a/src/includes/numa_types.h
+++ b/src/includes/numa_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for numa module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -37,10 +37,10 @@ typedef struct {
     uint64_t totalMemory;
     uint64_t freeMemory;
     int numberOfProcessors;
-    uint32_t*  processors;
-    uint32_t*  processorsCompact;
+    uint32_t* processors;
+    uint32_t* processorsCompact;
     int numberOfDistances;
-    uint32_t*  distances;
+    uint32_t* distances;
 } NumaNode;
 
 typedef struct {
diff --git a/src/includes/pci.h b/src/includes/pci.h
index 62aa69b..1672f1c 100644
--- a/src/includes/pci.h
+++ b/src/includes/pci.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File pci Module. 
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/pci_types.h b/src/includes/pci_types.h
index 2a542aa..cfb9657 100644
--- a/src/includes/pci_types.h
+++ b/src/includes/pci_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for pci module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon.h b/src/includes/perfmon.h
index 96653d1..6e9d9f9 100644
--- a/src/includes/perfmon.h
+++ b/src/includes/perfmon.h
@@ -7,8 +7,8 @@
  *                    Configures and reads out performance counters
  *                    on x86 based architectures. Supports multi threading.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -41,8 +41,7 @@ extern int perfmon_verbose;
 extern void (*perfmon_startCountersThread) (int thread_id);
 extern void (*perfmon_stopCountersThread) (int thread_id);
 extern int  (*perfmon_getIndex) (bstring reg, PerfmonCounterIndex* index);
-extern void (*perfmon_setupCounterThread) (int thread_id,
-       PerfmonEvent* event , PerfmonCounterIndex index);
+extern void (*perfmon_setupCounterThread) (int thread_id, PerfmonEvent* event , PerfmonCounterIndex index);
 
 extern void perfmon_initEventSet(StrUtilEventSet* eventSetConfig, PerfmonEventSet* set);
 extern void perfmon_setCSVMode(int v);
@@ -53,6 +52,46 @@ extern void perfmon_finalize(void);
 extern void perfmon_setupEventSet(bstring eventString, BitMask* mask);
 extern double perfmon_getEventResult(int thread, int index);
 extern int perfmon_setupEventSetC(char* eventCString, const char*** eventnames);
+
+
+/*
+The following structure and set of functions provide an efficient and easy interface to
+access counters from different groups and switch between them.
+
+TODO: The internals need some cleanup, but the interface should remain rather stable.
+
+Usage:
+setup = perfmon_prepareEventSetup("VIEW"), etc..
+Whenever you want to use one of the prepared setups call:
+perfmon_setupCountersForEventSet(setup)
+
+then you can startCounters, stopCounters and then
+perfmon_getEventCounterValues() and/or
+perfmon_getDerivedCounterValues()
+ */
+typedef struct {
+    const char* groupName;
+    int numberOfEvents;
+    const char** eventNames;
+    int numberOfDerivedCounters;
+    const char** derivedNames;    
+
+    // Internal structures DO NOT ACCESS THEM, they need cleanup.
+    StrUtilEventSet* eventSetConfig;
+    PerfmonEventSet* perfmon_set;
+    PerfmonGroup groupSet;
+    int groupIndex;
+} EventSetup;
+
+
+extern EventSetup perfmon_prepareEventSetup(char* eventGroupString);
+extern void perfmon_setupCountersForEventSet(EventSetup * setup);
+
+// obtain values for all cores, average, min and max for the cores.
+extern void perfmon_getEventCounterValues(uint64_t* avg_values, uint64_t* max, uint64_t* min);
+extern void perfmon_getDerivedCounterValues(float* avg_values, float* max, float* min);
+/////////////////////////
+
 extern void perfmon_setupCounters(void);
 extern void perfmon_startCounters(void);
 extern void perfmon_stopCounters(void);
@@ -61,6 +100,8 @@ extern double perfmon_getResult(int threadId, char* counterString);
 extern void perfmon_printMarkerResults(bstring filepath);
 extern void perfmon_logCounterResults(double time);
 extern void perfmon_printCounterResults(void);
+
+
 extern void perfmon_printCounters(void);
 extern void perfmon_printEvents(void);
 
diff --git a/src/includes/perfmon_atom.h b/src/includes/perfmon_atom.h
index 5477742..201cea6 100644
--- a/src/includes/perfmon_atom.h
+++ b/src/includes/perfmon_atom.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header file of perfmon module for Atom
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_atom_events.txt b/src/includes/perfmon_atom_events.txt
index ec92314..4ca18e4 100644
--- a/src/includes/perfmon_atom_events.txt
+++ b/src/includes/perfmon_atom_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel Atom
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_core2.h b/src/includes/perfmon_core2.h
index 193f630..f737dda 100644
--- a/src/includes/perfmon_core2.h
+++ b/src/includes/perfmon_core2.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header file of perfmon module for Core 2
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -73,12 +73,12 @@ void perfmon_setupCounterThread_core2(
     uint64_t flags;
     uint64_t reg = core2_counter_map[index].configRegister;
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
+    perfmon_threadData[thread_id].counters[index].init = TRUE;
 
     if ( core2_counter_map[index].type == PMC )
     {
-        perfmon_threadData[thread_id].counters[index].init = TRUE;
-        flags = msr_read(cpu_id,reg);
-        flags &= ~(0xFFFFU); 
+        flags = (1<<16)|(1<<19)|(1<<22);
 
         /* Intel with standard 8 bit event mask: [7:0] */
         flags |= (event->umask<<8) + event->eventId;
@@ -101,7 +101,8 @@ void perfmon_setupCounterThread_core2(
     }
     else if (core2_counter_map[index].type == FIXED)
     {
-        perfmon_threadData[thread_id].counters[index].init = TRUE;
+        fixed_flags |= (0x2 << (index*4));
+        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
     }
 }
 
diff --git a/src/includes/perfmon_core2_counters.h b/src/includes/perfmon_core2_counters.h
index cbade24..d6c33fb 100644
--- a/src/includes/perfmon_core2_counters.h
+++ b/src/includes/perfmon_core2_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Counter header file of perfmon module for Core 2
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -28,15 +28,16 @@
  * =======================================================================================
  */
 
-#define NUM_COUNTERS_CORE2 4
-#define NUM_COUNTERS_CORE_CORE2 4
+#define NUM_COUNTERS_CORE2 5
+#define NUM_COUNTERS_CORE_CORE2 5
 
 static PerfmonCounterMap core2_counter_map[NUM_COUNTERS_CORE2] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
     {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
     {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
     /* PMC Counters: 2 40bit wide */
-    {"PMC0", PMC2, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC3, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0}
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0}
 };
 
diff --git a/src/includes/perfmon_core2_events.txt b/src/includes/perfmon_core2_events.txt
index f8dc59f..60c6211 100644
--- a/src/includes/perfmon_core2_events.txt
+++ b/src/includes/perfmon_core2_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel Core 2
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
@@ -29,9 +29,12 @@
 EVENT_INSTR_RETIRED              0x00   FIXC0
 UMASK_INSTR_RETIRED_ANY          0x00
 
-EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+EVENT_CPU_CLK_UNHALTED_CORE      0x00   FIXC1
 UMASK_CPU_CLK_UNHALTED_CORE      0x00
 
+EVENT_CPU_CLK_UNHALTED_REF       0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
 EVENT_LOAD_BLOCK                 0x03   PMC
 UMASK_LOAD_BLOCK_STA             0x02
 UMASK_LOAD_BLOCK_STD             0x04
diff --git a/src/includes/perfmon_haswell.h b/src/includes/perfmon_haswell.h
index 0352476..57f12af 100644
--- a/src/includes/perfmon_haswell.h
+++ b/src/includes/perfmon_haswell.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of perfmon module for Haswell.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -61,39 +61,78 @@ void perfmon_init_haswell(PerfmonThread *thread)
     msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
     msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
 
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    /* Preinit of PERFEVSEL registers */
-    flags |= (1<<22);  /* enable flag */
-    flags |= (1<<16);  /* user mode flag */
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    if (cpuid_info.model != HASWELL_EX && cpuid_info.supportUncore)
+    {
+        msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0, 0xAA);
+        flags = msr_read(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0);
+        if (flags != 0xAA)
+        {
+            fprintf(stdout, "The current system does not support Uncore MSRs, deactivating Uncore support\n");
+            cpuid_info.supportUncore = 0;
+        }
+    }
 
-    msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
+    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) && (cpuid_info.supportUncore))
     {
-
+        flags = 0x0ULL;
+        flags = (1ULL<<22)|(1ULL<<20);
+        msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0, flags);
+        msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL1, flags);
+        msr_write(cpu_id, MSR_UNC_CBO_1_PERFEVTSEL0, flags);
+        msr_write(cpu_id, MSR_UNC_CBO_1_PERFEVTSEL1, flags);
+        msr_write(cpu_id, MSR_UNC_CBO_2_PERFEVTSEL0, flags);
+        msr_write(cpu_id, MSR_UNC_CBO_2_PERFEVTSEL1, flags);
+        msr_write(cpu_id, MSR_UNC_CBO_3_PERFEVTSEL0, flags);
+        msr_write(cpu_id, MSR_UNC_CBO_3_PERFEVTSEL1, flags);
+
+        msr_write(cpu_id, MSR_UNC_ARB_PERFEVTSEL0, flags);
+        msr_write(cpu_id, MSR_UNC_ARB_PERFEVTSEL1, flags);
+
+        msr_write(cpu_id, MSR_UNC_PERF_FIXED_CTRL, flags);
+
+        msr_write(cpu_id, MSR_UNC_CBO_0_CTR0, 0x0ULL);
+        msr_write(cpu_id, MSR_UNC_CBO_0_CTR1, 0x0ULL);
+        msr_write(cpu_id, MSR_UNC_CBO_1_CTR0, 0x0ULL);
+        msr_write(cpu_id, MSR_UNC_CBO_1_CTR1, 0x0ULL);
+        msr_write(cpu_id, MSR_UNC_CBO_2_CTR0, 0x0ULL);
+        msr_write(cpu_id, MSR_UNC_CBO_2_CTR1, 0x0ULL);
+        msr_write(cpu_id, MSR_UNC_CBO_3_CTR0, 0x0ULL);
+        msr_write(cpu_id, MSR_UNC_CBO_3_CTR1, 0x0ULL);
+
+        msr_write(cpu_id, MSR_UNC_ARB_CTR0, 0x0ULL);
+        msr_write(cpu_id, MSR_UNC_ARB_CTR1, 0x0ULL);
+
+        msr_write(cpu_id, MSR_UNC_PERF_FIXED_CTR, 0x0ULL);
     }
-
 }
 
+#define HAS_SETUP_BOX \
+    if (haveLock) \
+    { \
+        flags = (1ULL<<22)|(1ULL<<20); \
+        flags |= (event->umask<<8) + event->eventId; \
+        if (event->cfgBits != 0) /* set custom cfg and cmask */ \
+        { \
+            flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */ \
+            flags |= ((event->cmask<<8) + event->cfgBits)<<16; \
+        } \
+        msr_write(cpu_id, reg , flags); \
+    }
+
 void perfmon_setupCounterThread_haswell(
         int thread_id,
         PerfmonEvent* event,
         PerfmonCounterIndex index)
 {
     int haveLock = 0;
-    uint64_t flags;
+    uint64_t flags = 0x0ULL;
     uint32_t uflags;
     uint64_t reg = haswell_counter_map[index].configRegister;
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
+    uint64_t orig_fixed_flags = fixed_flags;
     perfmon_threadData[thread_id].counters[index].init = TRUE;
 
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
@@ -105,8 +144,7 @@ void perfmon_setupCounterThread_haswell(
     {
         case PMC:
 
-            flags = msr_read(cpu_id,reg);
-            flags &= ~(0xFFFFU);   /* clear lower 16bits */
+            flags = (1<<22)|(1<<16);
 
             /* Intel with standard 8 bit event mask: [7:0] */
             flags |= (event->umask<<8) + event->eventId;
@@ -124,20 +162,35 @@ void perfmon_setupCounterThread_haswell(
                         LLU_CAST reg,
                         LLU_CAST flags);
             }
-
             msr_write(cpu_id, reg , flags);
             break;
 
         case FIXED:
+            fixed_flags |= (0x2 << (index*4));
             break;
 
         case POWER:
             break;
 
+        case CBOX0:
+        case CBOX1:
+        case CBOX2:
+        case CBOX3:
+        case UBOX:
+            if (cpuid_info.supportUncore)
+            {
+                HAS_SETUP_BOX;
+            }
+            break;
+
         default:
             /* should never be reached */
             break;
     }
+    if (fixed_flags != orig_fixed_flags)
+    {
+        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+    }
 }
 
 void perfmon_startCountersThread_haswell(int thread_id)
@@ -146,6 +199,7 @@ void perfmon_startCountersThread_haswell(int thread_id)
     uint64_t flags = 0x0ULL;
     uint32_t uflags = 0x10000UL; /* Clear freeze bit */
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    int start_uncore = 0;
 
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
     {
@@ -176,7 +230,14 @@ void perfmon_startCountersThread_haswell(int thread_id)
                         perfmon_threadData[thread_id].counters[i].counterData =
                             power_read(cpu_id, haswell_counter_map[i].counterRegister);
                     }
+                    break;
 
+                case CBOX0:
+                case CBOX1:
+                case CBOX2:
+                case CBOX3:
+                case UBOX:
+                    start_uncore = 1;
                     break;
 
                 default:
@@ -186,6 +247,11 @@ void perfmon_startCountersThread_haswell(int thread_id)
         }
     }
 
+    if (haveLock && start_uncore && cpuid_info.supportUncore)
+    {
+        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<29));
+    }
+
     if (perfmon_verbose)
     {
         printf("perfmon_start_counters: Write Register 0x%X , \
@@ -193,7 +259,6 @@ void perfmon_startCountersThread_haswell(int thread_id)
         printf("perfmon_start_counters: Write Register 0x%X , \
                 Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
     }
-
     msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
     msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
 }
@@ -201,6 +266,7 @@ void perfmon_startCountersThread_haswell(int thread_id)
 void perfmon_stopCountersThread_haswell(int thread_id)
 {
     uint64_t flags;
+    uint64_t tmp;
     uint32_t uflags = 0x10100UL; /* Set freeze bit */
     uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
@@ -212,6 +278,10 @@ void perfmon_stopCountersThread_haswell(int thread_id)
     }
 
     msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (haveLock && cpuid_info.supportUncore)
+    {
+        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+    }
 
     for ( int i=0; i < perfmon_numCountersHaswell; i++ ) 
     {
@@ -241,6 +311,18 @@ void perfmon_stopCountersThread_haswell(int thread_id)
                              thermal_read(cpu_id);
                     break;
 
+                case CBOX0:
+                case CBOX1:
+                case CBOX2:
+                case CBOX3:
+                case UBOX:
+                    if(haveLock && cpuid_info.supportUncore)
+                    {
+                        perfmon_threadData[thread_id].counters[i].counterData =
+                            msr_read(cpu_id, haswell_counter_map[i].counterRegister);
+                    }
+                    break;
+
                 default:
                     /* should never be reached */
                     break;
@@ -261,12 +343,22 @@ void perfmon_readCountersThread_haswell(int thread_id)
     uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t core_flags = 0x0ULL;
+    uint64_t uncore_flags = 0x0ULL;
 
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
     {
         haveLock = 1;
     }
 
+    core_flags = msr_read(cpu_id, MSR_PERF_GLOBAL_CTRL);
+    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (cpuid_info.supportUncore)
+    {
+        uncore_flags = msr_read(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL);
+        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+    }
+
     for ( int i=0; i<perfmon_numCountersHaswell; i++ )
     {
         if (perfmon_threadData[thread_id].counters[i].init == TRUE)
@@ -289,6 +381,17 @@ void perfmon_readCountersThread_haswell(int thread_id)
                                 power_read(cpu_id, haswell_counter_map[i].counterRegister);
                             break;
 
+                        case CBOX0:
+                        case CBOX1:
+                        case CBOX2:
+                        case CBOX3:
+                        case UBOX:
+                            if(haveLock)
+                            {
+                                perfmon_threadData[thread_id].counters[i].counterData =
+                                    msr_read(cpu_id, haswell_counter_map[i].counterRegister);
+                            }
+                            break;
                         default:
                             /* should never be reached */
                             break;
@@ -297,5 +400,10 @@ void perfmon_readCountersThread_haswell(int thread_id)
             }
         }
     }
+    if (cpuid_info.supportUncore && uncore_flags > 0x0ULL)
+    {
+        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, uncore_flags);
+    }
+    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, core_flags);
 }
 
diff --git a/src/includes/perfmon_haswell_counters.h b/src/includes/perfmon_haswell_counters.h
index 4302efe..3dc7247 100644
--- a/src/includes/perfmon_haswell_counters.h
+++ b/src/includes/perfmon_haswell_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Counter Header File of perfmon module for Haswell.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -29,7 +29,8 @@
  */
 
 #define NUM_COUNTERS_HASWELL 12
-#define NUM_COUNTERS_CORE_HASWELL 7
+#define NUM_COUNTERS_UNCORE_HASWELL 4
+#define NUM_COUNTERS_CORE_HASWELL 8
 
 static PerfmonCounterMap haswell_counter_map[NUM_COUNTERS_HASWELL] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
@@ -45,8 +46,8 @@ static PerfmonCounterMap haswell_counter_map[NUM_COUNTERS_HASWELL] = {
     {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
     /* RAPL counters */
     {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC9, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR2", PMC10, POWER, 0, MSR_PKG_ENERGY_STATUS,  0, 0},
-    {"PWR3", PMC11, POWER, 0, MSR_PKG_ENERGY_STATUS,  0, 0},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0},
 };
 
diff --git a/src/includes/perfmon_haswell_events.txt b/src/includes/perfmon_haswell_events.txt
index bb9d56d..f958a3a 100644
--- a/src/includes/perfmon_haswell_events.txt
+++ b/src/includes/perfmon_haswell_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel Ivy Bridge
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
@@ -35,6 +35,9 @@ UMASK_PWR_PKG_ENERGY          0x00
 EVENT_PWR_PP0_ENERGY          0x00   PWR1
 UMASK_PWR_PP0_ENERGY          0x00
 
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
 EVENT_PWR_DRAM_ENERGY          0x00   PWR3
 UMASK_PWR_DRAM_ENERGY          0x00
 
@@ -49,6 +52,7 @@ UMASK_CPU_CLK_UNHALTED_REF       0x00
 
 EVENT_LD_BLOCKS                 0x03  PMC
 UMASK_LD_BLOCKS_STORE_FORWARD   0x02
+UMASK_LD_BLOCKS_NO_SR           0x08
 
 EVENT_MISALIGN_MEM_REF           0x05  PMC
 UMASK_MISALIGN_MEM_REF_LOADS      0x01
@@ -56,7 +60,7 @@ UMASK_MISALIGN_MEM_REF_STORES     0x02
 UMASK_MISALIGN_MEM_REF_ANY        0x03
 
 EVENT_LD_BLOCKS_PARTIAL      0x07  PMC
-UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
 
 EVENT_DTLB_LOAD_MISSES                 0x08  PMC
 UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK         0x01
@@ -69,8 +73,8 @@ UMASK_DTLB_LOAD_MISSES_STLB_HIT_2M           0x40
 UMASK_DTLB_LOAD_MISSES_STLB_HIT              0x60
 UMASK_DTLB_LOAD_MISSES_PDE_CACHE_MISS        0x80
 
-EVENT_RECOVERY_CYCLES            0x0D  PMC
-UMASK_UOPS_ISSUED_ANY            0x03
+EVENT_INT_MISC            0x0D  PMC
+UMASK_INT_MISC_RECOVERY_CYCLES  0x03 0x01
 
 EVENT_UOPS_ISSUED                0x0E  PMC
 UMASK_UOPS_ISSUED_ANY            0x01
@@ -84,7 +88,7 @@ UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_HIT 0x41
 UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD     0xE1
 UMASK_L2_RQSTS_RFO_HIT           0x42
 UMASK_L2_RQSTS_RFO_MISS          0x22
-UMASK_L2_RQSTS_RFO_ANY           0xE2
+UMASK_L2_RQSTS_ALL_RFO           0xE2
 UMASK_L2_RQSTS_CODE_RD_HIT        0x44
 UMASK_L2_RQSTS_CODE_RD_MISS       0x24
 UMASK_L2_RQSTS_ALL_DEMAND_MISS   0x27
@@ -105,7 +109,7 @@ UMASK_LONGEST_LAT_CACHE_MISS          0x41
 
 EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
 UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
-UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK     0x01
 
 EVENT_L1D_PEND_MISS              0x48   PMC1
 UMASK_L1D_PEND_MISS_PENDING      0x01
@@ -127,6 +131,9 @@ UMASK_LOAD_HIT_PRE_HW_PF               0x02
 
 EVENT_L1D                        0x51   PMC
 UMASK_L1D_REPLACEMENT             0x01
+UMASK_L1D_ALLOCATED_IN_M          0x02
+UMASK_L1D_M_EVICT                 0x04
+UMASK_L1D_ALL_M_REPLACEMENT       0x08
 
 EVENT_MOVE_ELIMINATION                        0x58   PMC
 UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
@@ -154,18 +161,27 @@ UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION       0x02
 EVENT_IDQ               0x79   PMC
 UMASK_IDQ_EMPTY         0x02
 UMASK_IDQ_MITE_UOPS     0x04
+UMASK_IDQ_MITE_UOPS_CYCLES  0x04 0x00 0x01
 UMASK_IDQ_DSB_UOPS      0x08
+UMASK_IDQ_DSB_UOPS_CYCLES  0x08 0x00 0x01
 UMASK_IDQ_MS_DSB_UOPS   0x10
+UMASK_IDQ_MS_DSB_UOPS_CYCLES  0x10 0x00 0x01
 UMASK_IDQ_MS_MITE_UOPS  0x20
+UMASK_IDQ_MS_MITE_UOPS_CYCLES  0x20 0x00 0x01
 UMASK_IDQ_MS_UOPS       0x30
+UMASK_IDQ_MS_UOPS_CYCLES  0x30 0x00 0x01
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18 0x00 0x01
 UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18 0x00 0x01
 UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18 0x00 0x04
-UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24 0x00 0x01
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS       0x24 0x00 0x01
 UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24 0x00 0x04
-UMASK_IDQ_ALL_MITE_ALL_UOPS       0x3C
+UMASK_IDQ_MITE_ALL_UOPS       0x3C
 
 EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HITS             0x01
 UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
 
 EVENT_ITLB_MISSES                 0x85      PMC
 UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
@@ -185,28 +201,19 @@ EVENT_BR_INST_EXEC                                      0x88   PMC
 UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
 UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
 UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
-UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN                 0x42
 UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
-UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
 UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
-UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN                0x48
 UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
-UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0 
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60 
-UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF 
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF
 
 EVENT_BR_MISP_EXEC                                      0x89   PMC
 UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
 UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
 UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
-UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
 UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
-UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN                0x48
 UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
-UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
 UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
-UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
 UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
 
 EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
@@ -230,9 +237,18 @@ UMASK_RESOURCE_STALLS_ROB             0x10
 
 EVENT_CYCLE_ACTIVITY                 0xA3   PMC
 UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING             0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING_CYCLES      0x01 0x00 0x02
 UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING              0x02
-UMASK_CYCLE_ACTIVITY_STALL_L2_PENDING            0x05
-UMASK_CYCLE_ACTIVITY_L1D_PENDING               0x08
+UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING_CYCLES      0x01 0x00 0x02
+UMASK_CYCLE_ACTIVITY_STALLS_L2_PENDING            0x05
+
+EVENT_CYCLE_ACTIVITY_CYCLES                 0xA3   PMC2
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_PENDING               0x08
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_PENDING_CYCLES        0x08 0x00 0x08
+
+EVENT_CYCLE_ACTIVITY_STALLS                 0xA3   PMC2
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_PENDING               0x0C
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_PENDING_CYCLES        0x0C 0x00 0x0C
 
 EVENT_LSD_UOPS                 0xA8   PMC
 UMASK_LSD_UOPS             0x01
@@ -265,7 +281,6 @@ UMASK_TLB_FLUSH_STLB_ANY        0x20
 
 EVENT_INST_RETIRED                  0xC0  PMC1
 UMASK_INST_RETIRED_ANY_P            0x00
-UMASK_INST_RETIRED_ALL              0x01
 
 EVENT_OTHER_ASSISTS                  0xC1  PMC
 UMASK_OTHER_ASSISTS_AVX_TO_SSE            0x08
@@ -292,11 +307,11 @@ UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
 UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
 
 EVENT_BR_MISP_RETIRED               0xC5  PMC
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES_1  0x00
 UMASK_BR_MISP_RETIRED_CONDITIONAL  0x01
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES     0x04
-UMASK_BR_MISP_RETIRED_NOT_TAKEN      0x10
-UMASK_BR_MISP_RETIRED_TAKEN      0x20
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES_2     0x04
+UMASK_BR_MISP_RETIRED_NEAR_NOT_TAKEN      0x10
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN      0x20
 
 EVENT_FP_ASSIST               0xCA  PMC
 UMASK_FP_ASSIST_X87_OUTPUT               0x02
@@ -309,7 +324,7 @@ EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
 UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
 
 EVENT_MEM_TRANS_RETIRED_LOAD_LAT               0xCD  PMC
-UMASK_MEM_TRANS_RETIRED_LOAD_LAT               0x01
+UMASK_MEM_TRANS_RETIRED_LOAD_LATENCY           0x01
 
 EVENT_MEM_UOP_RETIRED            0xD0    PMC
 UMASK_MEM_UOP_RETIRED_LOADS            0x81
@@ -321,18 +336,23 @@ UMASK_MEM_UOP_RETIRED_STORES_LOCK             0x22
 UMASK_MEM_UOP_RETIRED_LOADS_SPLIT             0x41
 UMASK_MEM_UOP_RETIRED_STORES_SPLIT            0x42
 
-EVENT_MEMLOAD_UOPS_RETIRED               0xD1   PMC
-UMASK_MEMLOAD_UOPS_RETIRED_L1_HIT       0x01
-UMASK_MEMLOAD_UOPS_RETIRED_L2_HIT       0x02
-UMASK_MEMLOAD_UOPS_RETIRED_LLC_HIT      0x04
-UMASK_MEMLOAD_UOPS_RETIRED_L2_MISS      0x10
-UMASK_MEMLOAD_UOPS_RETIRED_HIT_LFB      0x40
-
-EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED               0xD2   PMC
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS         0x01
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT          0x02
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM         0x04
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE         0x08
+EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+
+EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED               0xD2   PMC
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS         0x01
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT          0x02
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM         0x04
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE         0x08
 
 EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED               0xD3   PMC
 UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM      0x01
@@ -374,6 +394,8 @@ UMASK_TX_EXEC_MISC1     0x01
 UMASK_TX_EXEC_MISC2     0x02
 UMASK_TX_EXEC_MISC3     0x04
 UMASK_TX_EXEC_MISC4     0x08
+UMASK_TX_EXEC_MISC5     0x10
+
 
 EVENT_HLE_RETIRED                  0xC8   PMC
 UMASK_HLE_RETIRED_START            0x01
@@ -394,49 +416,3 @@ UMASK_RTM_RETIRED_ABORTED_MISC2     0x10
 UMASK_RTM_RETIRED_ABORTED_MISC3     0x20
 UMASK_RTM_RETIRED_ABORTED_MISC4     0x40
 UMASK_RTM_RETIRED_ABORTED_MISC5     0x80
-
-EVENT_UNC_CBO_XSNP_RESPONSE             0x22   UPMC
-UMASK_UNC_CBO_XSNP_RESPONSE_EXT_FILTER_MISS        0x21
-UMASK_UNC_CBO_XSNP_RESPONSE_XCORE_FILTER_MISS      0x41
-UMASK_UNC_CBO_XSNP_RESPONSE_EVICTION_FILTER_MISS   0x81
-UMASK_UNC_CBO_XSNP_RESPONSE_EXT_FILTER_INVAL        0x22
-UMASK_UNC_CBO_XSNP_RESPONSE_XCORE_FILTER_INVAL      0x42
-UMASK_UNC_CBO_XSNP_RESPONSE_EVICTION_FILTER_INVAL   0x82
-UMASK_UNC_CBO_XSNP_RESPONSE_EXT_FILTER_HIT        0x24
-UMASK_UNC_CBO_XSNP_RESPONSE_XCORE_FILTER_HIT      0x44
-UMASK_UNC_CBO_XSNP_RESPONSE_EVICTION_FILTER_HIT   0x84
-UMASK_UNC_CBO_XSNP_RESPONSE_EXT_FILTER_HITM        0x28
-UMASK_UNC_CBO_XSNP_RESPONSE_XCORE_FILTER_HITM      0x48
-UMASK_UNC_CBO_XSNP_RESPONSE_EVICTION_FILTER_HITM   0x88
-UMASK_UNC_CBO_XSNP_RESPONSE_EXT_FILTER_INVAL_M        0x20
-UMASK_UNC_CBO_XSNP_RESPONSE_XCORE_FILTER_INVAL_M      0x50
-UMASK_UNC_CBO_XSNP_RESPONSE_EVICTION_FILTER_INVAL_M   0x90
-
-EVENT_UNC_CBO_CACHE_LOOKUP             0x34   UPMC
-UMASK_UNC_CBO_CACHE_LOOKUP_READ_FILTER_M        0x11
-UMASK_UNC_CBO_CACHE_LOOKUP_WRITE_FILTER_M       0x26
-UMASK_UNC_CBO_CACHE_LOOKUP_EXTSNP_FILTER_M      0x48
-UMASK_UNC_CBO_CACHE_LOOKUP_ANY_FILTER_M         0x81
-UMASK_UNC_CBO_CACHE_LOOKUP_READ_FILTER_ES       0x16
-UMASK_UNC_CBO_CACHE_LOOKUP_WRITE_FILTER_ES      0x28
-UMASK_UNC_CBO_CACHE_LOOKUP_EXTSNP_FILTER_ES     0x41
-UMASK_UNC_CBO_CACHE_LOOKUP_ANY_FILTER_ES        0x86
-UMASK_UNC_CBO_CACHE_LOOKUP_READ_FILTER_I        0x18
-UMASK_UNC_CBO_CACHE_LOOKUP_WRITE_FILTER_I       0x21
-UMASK_UNC_CBO_CACHE_LOOKUP_EXTSNP_FILTER_I      0x46
-UMASK_UNC_CBO_CACHE_LOOKUP_ANY_FILTER_I         0x88
-
-EVENT_UNC_ARB_TRK_OCCUPANCY_ALL        0x80       UPMC
-UMASK_UNC_ARB_TRK_OCCUPANCY_ALL        0x01
-
-EVENT_UNC_ARB_TRK_REQUEST              0x81       UPMC
-UMASK_UNC_ARB_TRK_REQUEST_ALL          0x01
-UMASK_UNC_ARB_TRK_REQUEST_WRITES       0x20
-UMASK_UNC_ARB_TRK_REQUEST_EVICTIONS    0x80
-
-EVENT_UNC_ARB_COH_TRK_OCCUPANCY_ALL        0x83   UPMC
-UMASK_UNC_ARB_COH_TRK_OCCUPANCY_ALL        0x01
-
-EVENT_UNC_ARB_COH_TRK_REQUEST_ALL        0x84     UPMC
-UMASK_UNC_ARB_COH_TRK_REQUEST_ALL        0x01
-
diff --git a/src/includes/perfmon_interlagos.h b/src/includes/perfmon_interlagos.h
index b96b944..d28bb18 100644
--- a/src/includes/perfmon_interlagos.h
+++ b/src/includes/perfmon_interlagos.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header file of perfmon module for AMD Interlagos
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -60,13 +60,13 @@ void perfmon_init_interlagos(PerfmonThread *thread)
         msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL3, 0x0ULL);
     }
 
-    flags |= (1<<16);  /* user mode flag */
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL0, flags);
+    //flags |= (1<<16);  /* user mode flag */
+    /*msr_write(cpu_id, MSR_AMD15_PERFEVTSEL0, flags);
     msr_write(cpu_id, MSR_AMD15_PERFEVTSEL1, flags);
     msr_write(cpu_id, MSR_AMD15_PERFEVTSEL2, flags);
     msr_write(cpu_id, MSR_AMD15_PERFEVTSEL3, flags);
     msr_write(cpu_id, MSR_AMD15_PERFEVTSEL4, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL5, flags);
+    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL5, flags);*/
 }
 
 
@@ -87,9 +87,7 @@ void perfmon_setupCounterThread_interlagos(
         return;
     }
 
-    flags = msr_read(cpu_id,reg);
-    flags &= ~(0xFFFFU); 
-
+    flags = (1<<16);
     /* AMD uses a 12 bit Event mask: [35:32][7:0] */
     flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
 
diff --git a/src/includes/perfmon_interlagos_counters.h b/src/includes/perfmon_interlagos_counters.h
index 136d0f7..a593f5a 100644
--- a/src/includes/perfmon_interlagos_counters.h
+++ b/src/includes/perfmon_interlagos_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Counter Header File of perfmon module for AMD Interlagos
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_interlagos_events.txt b/src/includes/perfmon_interlagos_events.txt
index 16f0a9b..1fa0a44 100644
--- a/src/includes/perfmon_interlagos_events.txt
+++ b/src/includes/perfmon_interlagos_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for AMD Interlagos
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_ivybridge.h b/src/includes/perfmon_ivybridge.h
index 9de9f6d..0615c27 100644
--- a/src/includes/perfmon_ivybridge.h
+++ b/src/includes/perfmon_ivybridge.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of perfmon module for Ivy Bridge.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -65,16 +65,16 @@ void perfmon_init_ivybridge(PerfmonThread *thread)
      * FIXED 0: Instructions retired
      * FIXED 1: Clocks unhalted core
      * FIXED 2: Clocks unhalted ref */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
+    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
 
     /* Preinit of PERFEVSEL registers */
-    flags |= (1<<22);  /* enable flag */
-    flags |= (1<<16);  /* user mode flag */
+    //flags |= (1<<22);  /* enable flag */
+    //flags |= (1<<16);  /* user mode flag */
 
-    msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
+    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
     msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
     msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);
+    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
 
     /* TODO Robust implementation which also works if stuff is not there */
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
@@ -150,11 +150,12 @@ void perfmon_init_ivybridge(PerfmonThread *thread)
                 pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
                 pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
 
+#if 0
                 /* FIXME: Not yet tested/ working due to BIOS issues on test
                  * machines */
 
                 /* QPI registers can be zeroed with single write */
-                uflags = 0x0113UL; /*enable freeze (bit 16), freeze (bit 8), reset */
+                uflags = 0x0103UL; /* freeze (bit 8), reset */
                 pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
                 pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
                 uflags = 0x0UL;
@@ -168,7 +169,7 @@ void perfmon_init_ivybridge(PerfmonThread *thread)
                 pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_2, uflags);
                 pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_3, uflags);
 
-#if 0
+
                 /* Cbo counters */
                 uflags = 0xF0103UL; /*enable freeze (bit 8), reset */
                 msr_write(cpu_id, MSR_UNC_C0_PMON_BOX_CTL, uflags);
@@ -197,19 +198,22 @@ void perfmon_init_ivybridge(PerfmonThread *thread)
     }
 }
 
-#define BOX_GATE_SNB(channel,label) \
+#define BOX_GATE_IVB(channel,label) \
     if (perfmon_verbose) { \
-        printf("[%d] perfmon_setup_counter (label): Write Register 0x%llX , Flags: 0x%llX \n", \
+        printf("[%d] perfmon_setup_counter (##label): Write Register 0x%llX , Flags: 0x%llX \n", \
                 cpu_id, \
                 LLU_CAST reg, \
                 LLU_CAST flags); \
     } \
-if(haveLock) { \
-    uflags = pci_read(cpu_id, channel, reg);  \
-    uflags &= ~(0xFFFFU);  \
-    uflags |= (event->umask<<8) + event->eventId;  \
-    pci_write(cpu_id, channel,  reg, uflags);  \
-}
+    if(haveLock) { \
+        uflags = (1UL<<22);\
+        uflags |= (event->umask<<8) + event->eventId;  \
+        if (event->cfgBits == 0xFF) \
+        { \
+            uflags |= (1<<21); \
+        } \
+        pci_write(cpu_id, channel,  reg, uflags);  \
+    }
 
 
 void perfmon_setupCounterThread_ivybridge(
@@ -222,6 +226,8 @@ void perfmon_setupCounterThread_ivybridge(
     uint32_t uflags;
     uint64_t reg = ivybridge_counter_map[index].configRegister;
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
+    uint64_t orig_fixed_flags = fixed_flags;
     perfmon_threadData[thread_id].counters[index].init = TRUE;
 
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
@@ -233,9 +239,10 @@ void perfmon_setupCounterThread_ivybridge(
     {
         case PMC:
 
-            perfmon_threadData[thread_id].counters[index].init = TRUE;
-            flags = msr_read(cpu_id,reg);
-            flags &= ~(0xFFFFU);   /* clear lower 16bits */
+
+            //flags = msr_read(cpu_id,reg);
+            //flags &= ~(0xFFFFU);   /* clear lower 16bits */
+            flags = (1<<22)|(1<<16);
 
             /* Intel with standard 8 bit event mask: [7:0] */
             flags |= (event->umask<<8) + event->eventId;
@@ -258,25 +265,26 @@ void perfmon_setupCounterThread_ivybridge(
             break;
 
         case FIXED:
+            fixed_flags |= (0x2ULL<<(index*4));
             break;
 
         case POWER:
             break;
 
         case MBOX0:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_0,MBOX0);
+            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_0,MBOX0);
             break;
 
         case MBOX1:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_1,MBOX1);
+            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_1,MBOX1);
             break;
 
         case MBOX2:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_2,MBOX2);
+            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_2,MBOX2);
             break;
 
         case MBOX3:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_3,MBOX3);
+            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_3,MBOX3);
             break;
 
         case SBOX0:
@@ -286,8 +294,9 @@ void perfmon_setupCounterThread_ivybridge(
             {
                 if(haveLock)
                 {
-                    uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0, reg);
-                    uflags &= ~(0xFFFFU);
+                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0, reg);
+                    //uflags &= ~(0xFFFFU);
+                    uflags = (1UL<<22);
                     uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
                     printf("UFLAGS 0x%x \n",uflags);
                     pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  reg, uflags);
@@ -307,7 +316,7 @@ void perfmon_setupCounterThread_ivybridge(
             }
             else
             {
-                BOX_GATE_SNB(PCI_QPI_DEVICE_PORT_0,SBOX0);
+                BOX_GATE_IVB(PCI_QPI_DEVICE_PORT_0,SBOX0);
             }
 
             break;
@@ -319,8 +328,9 @@ void perfmon_setupCounterThread_ivybridge(
             {
                 if(haveLock)
                 {
-                    uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1, reg);
-                    uflags &= ~(0xFFFFU);
+                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1, reg);
+                    //uflags &= ~(0xFFFFU);
+                    uflags = (1UL<<22);
                     uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
                     pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  reg, uflags);
 
@@ -337,7 +347,7 @@ void perfmon_setupCounterThread_ivybridge(
             }
             else
             {
-                BOX_GATE_SNB(PCI_QPI_DEVICE_PORT_0,SBOX0);
+                BOX_GATE_IVB(PCI_QPI_DEVICE_PORT_0,SBOX0);
             }
             break;
 
@@ -379,6 +389,10 @@ void perfmon_setupCounterThread_ivybridge(
             /* should never be reached */
             break;
     }
+    if (fixed_flags != orig_fixed_flags)
+    {
+        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+    }
 }
 
 #define CBOX_START(NUM) \
@@ -454,14 +468,14 @@ void perfmon_startCountersThread_ivybridge(int thread_id)
                 case SBOX0:
                     if(haveLock)
                     {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
+                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, 0x0ULL);
                     }
                     break;
 
                 case SBOX1:
                     if(haveLock)
                     {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
+                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, 0x0ULL);
                     }
                     break;
 
@@ -549,7 +563,7 @@ if(haveLock) { \
 
 #define SBOX_STOP(NUM) \
 if(haveLock) { \
-    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_##NUM ,  PCI_UNC_QPI_PMON_BOX_CTL, uflags); \
+    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_##NUM ,  PCI_UNC_QPI_PMON_BOX_CTL, (1<<8)); \
     counter_result = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_##NUM , ivybridge_counter_map[i].counterRegister); \
     counter_result = (counter_result<<32) + pci_read(cpu_id, PCI_QPI_DEVICE_PORT_##NUM , ivybridge_counter_map[i].counterRegister2);  \
     perfmon_threadData[thread_id].counters[i].counterData = counter_result; \
@@ -713,13 +727,11 @@ void perfmon_readCountersThread_ivybridge(int thread_id)
     {
         haveLock = 1;
     }
-
     for ( int i=0; i<NUM_COUNTERS_IVYBRIDGE; i++ )
     {
         if (perfmon_threadData[thread_id].counters[i].init == TRUE)
         {
-            if ((ivybridge_counter_map[i].type == PMC) ||
-                    (ivybridge_counter_map[i].type == FIXED))
+            if ((ivybridge_counter_map[i].type == PMC) || (ivybridge_counter_map[i].type == FIXED))
             {
                 perfmon_threadData[thread_id].counters[i].counterData =
                     msr_read(cpu_id, ivybridge_counter_map[i].counterRegister);
diff --git a/src/includes/perfmon_ivybridge_counters.h b/src/includes/perfmon_ivybridge_counters.h
index d4fa25a..e63dfb0 100644
--- a/src/includes/perfmon_ivybridge_counters.h
+++ b/src/includes/perfmon_ivybridge_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description: Counter header file of perfmon module for Ivy Bridge.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -29,8 +29,8 @@
  */
 
 #define NUM_COUNTERS_CORE_IVYBRIDGE 8
-#define NUM_COUNTERS_UNCORE_IVYBRIDGE 60
-#define NUM_COUNTERS_IVYBRIDGE 85
+#define NUM_COUNTERS_UNCORE_IVYBRIDGE 12
+#define NUM_COUNTERS_IVYBRIDGE 32
 
 static PerfmonCounterMap ivybridge_counter_map[NUM_COUNTERS_IVYBRIDGE] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
@@ -49,82 +49,27 @@ static PerfmonCounterMap ivybridge_counter_map[NUM_COUNTERS_IVYBRIDGE] = {
     {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
     {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0},
     {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0},
-    /* CBOX counters */
-    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_C0_PMON_CTL0, MSR_UNC_C0_PMON_CTR0, 0, 0},
-    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_C0_PMON_CTL1, MSR_UNC_C0_PMON_CTR1, 0, 0},
-    {"CBOX0C2", PMC14, CBOX0, MSR_UNC_C0_PMON_CTL2, MSR_UNC_C0_PMON_CTR2, 0, 0},
-    {"CBOX0C3", PMC15, CBOX0, MSR_UNC_C0_PMON_CTL3, MSR_UNC_C0_PMON_CTR3, 0, 0},
-    {"CBOX1C0", PMC16, CBOX1, MSR_UNC_C1_PMON_CTL0, MSR_UNC_C1_PMON_CTR0, 0, 0},
-    {"CBOX1C1", PMC17, CBOX1, MSR_UNC_C1_PMON_CTL1, MSR_UNC_C1_PMON_CTR1, 0, 0},
-    {"CBOX1C2", PMC18, CBOX1, MSR_UNC_C1_PMON_CTL2, MSR_UNC_C1_PMON_CTR2, 0, 0},
-    {"CBOX1C3", PMC19, CBOX1, MSR_UNC_C1_PMON_CTL3, MSR_UNC_C1_PMON_CTR3, 0, 0},
-    {"CBOX2C0", PMC20, CBOX2, MSR_UNC_C2_PMON_CTL0, MSR_UNC_C2_PMON_CTR0, 0, 0},
-    {"CBOX2C1", PMC21, CBOX2, MSR_UNC_C2_PMON_CTL1, MSR_UNC_C2_PMON_CTR1, 0, 0},
-    {"CBOX2C2", PMC22, CBOX2, MSR_UNC_C2_PMON_CTL2, MSR_UNC_C2_PMON_CTR2, 0, 0},
-    {"CBOX2C3", PMC23, CBOX2, MSR_UNC_C2_PMON_CTL3, MSR_UNC_C2_PMON_CTR3, 0, 0},
-    {"CBOX3C0", PMC24, CBOX3, MSR_UNC_C3_PMON_CTL0, MSR_UNC_C3_PMON_CTR0, 0, 0},
-    {"CBOX3C1", PMC25, CBOX3, MSR_UNC_C3_PMON_CTL1, MSR_UNC_C3_PMON_CTR1, 0, 0},
-    {"CBOX3C2", PMC26, CBOX3, MSR_UNC_C3_PMON_CTL2, MSR_UNC_C3_PMON_CTR2, 0, 0},
-    {"CBOX3C3", PMC27, CBOX3, MSR_UNC_C3_PMON_CTL3, MSR_UNC_C3_PMON_CTR3, 0, 0},
-    {"CBOX4C0", PMC28, CBOX4, MSR_UNC_C4_PMON_CTL0, MSR_UNC_C4_PMON_CTR0, 0, 0},
-    {"CBOX4C1", PMC29, CBOX4, MSR_UNC_C4_PMON_CTL1, MSR_UNC_C4_PMON_CTR1, 0, 0},
-    {"CBOX4C2", PMC30, CBOX4, MSR_UNC_C4_PMON_CTL2, MSR_UNC_C4_PMON_CTR2, 0, 0},
-    {"CBOX4C3", PMC31, CBOX4, MSR_UNC_C4_PMON_CTL3, MSR_UNC_C4_PMON_CTR3, 0, 0},
-    {"CBOX5C0", PMC32, CBOX5, MSR_UNC_C5_PMON_CTL0, MSR_UNC_C5_PMON_CTR0, 0, 0},
-    {"CBOX5C1", PMC33, CBOX5, MSR_UNC_C5_PMON_CTL1, MSR_UNC_C5_PMON_CTR1, 0, 0},
-    {"CBOX5C2", PMC34, CBOX5, MSR_UNC_C5_PMON_CTL2, MSR_UNC_C5_PMON_CTR2, 0, 0},
-    {"CBOX5C3", PMC35, CBOX5, MSR_UNC_C5_PMON_CTL3, MSR_UNC_C5_PMON_CTR3, 0, 0},
-    {"CBOX6C0", PMC36, CBOX6, MSR_UNC_C6_PMON_CTL0, MSR_UNC_C6_PMON_CTR0, 0, 0},
-    {"CBOX6C1", PMC37, CBOX6, MSR_UNC_C6_PMON_CTL1, MSR_UNC_C6_PMON_CTR1, 0, 0},
-    {"CBOX6C2", PMC38, CBOX6, MSR_UNC_C6_PMON_CTL2, MSR_UNC_C6_PMON_CTR2, 0, 0},
-    {"CBOX6C3", PMC39, CBOX6, MSR_UNC_C6_PMON_CTL3, MSR_UNC_C6_PMON_CTR3, 0, 0},
-    {"CBOX7C0", PMC40, CBOX7, MSR_UNC_C7_PMON_CTL0, MSR_UNC_C7_PMON_CTR0, 0, 0},
-    {"CBOX7C1", PMC41, CBOX7, MSR_UNC_C7_PMON_CTL1, MSR_UNC_C7_PMON_CTR1, 0, 0},
-    {"CBOX7C2", PMC42, CBOX7, MSR_UNC_C7_PMON_CTL2, MSR_UNC_C7_PMON_CTR2, 0, 0},
-    {"CBOX7C3", PMC43, CBOX7, MSR_UNC_C7_PMON_CTL3, MSR_UNC_C7_PMON_CTR3, 0, 0},
-    {"CBOX8C0", PMC44, CBOX8, MSR_UNC_C8_PMON_CTL0, MSR_UNC_C8_PMON_CTR0, 0, 0},
-    {"CBOX8C1", PMC45, CBOX8, MSR_UNC_C8_PMON_CTL1, MSR_UNC_C8_PMON_CTR1, 0, 0},
-    {"CBOX8C2", PMC46, CBOX8, MSR_UNC_C8_PMON_CTL2, MSR_UNC_C8_PMON_CTR2, 0, 0},
-    {"CBOX8C3", PMC47, CBOX8, MSR_UNC_C8_PMON_CTL3, MSR_UNC_C8_PMON_CTR3, 0, 0},
-    {"CBOX9C0", PMC48, CBOX9, MSR_UNC_C9_PMON_CTL0, MSR_UNC_C9_PMON_CTR0, 0, 0},
-    {"CBOX9C1", PMC49, CBOX9, MSR_UNC_C9_PMON_CTL1, MSR_UNC_C9_PMON_CTR1, 0, 0},
-    {"CBOX9C2", PMC50, CBOX9, MSR_UNC_C9_PMON_CTL2, MSR_UNC_C9_PMON_CTR2, 0, 0},
-    {"CBOX9C3", PMC51, CBOX9, MSR_UNC_C9_PMON_CTL3, MSR_UNC_C9_PMON_CTR3, 0, 0},
-    {"CBOX9C0", PMC52, CBOX10, MSR_UNC_C10_PMON_CTL0, MSR_UNC_C10_PMON_CTR0, 0, 0},
-    {"CBOX9C1", PMC53, CBOX10, MSR_UNC_C10_PMON_CTL1, MSR_UNC_C10_PMON_CTR1, 0, 0},
-    {"CBOX9C2", PMC54, CBOX10, MSR_UNC_C10_PMON_CTL2, MSR_UNC_C10_PMON_CTR2, 0, 0},
-    {"CBOX9C3", PMC55, CBOX10, MSR_UNC_C10_PMON_CTL3, MSR_UNC_C10_PMON_CTR3, 0, 0},
-    {"CBOX9C0", PMC56, CBOX11, MSR_UNC_C11_PMON_CTL0, MSR_UNC_C11_PMON_CTR0, 0, 0},
-    {"CBOX9C1", PMC57, CBOX11, MSR_UNC_C11_PMON_CTL1, MSR_UNC_C11_PMON_CTR1, 0, 0},
-    {"CBOX9C2", PMC58, CBOX11, MSR_UNC_C11_PMON_CTL2, MSR_UNC_C11_PMON_CTR2, 0, 0},
-    {"CBOX9C3", PMC59, CBOX11, MSR_UNC_C11_PMON_CTL3, MSR_UNC_C11_PMON_CTR3, 0, 0},
     /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
-    {"MBOX0C0",PMC60, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX1C0",PMC61, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX2C0",PMC62, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX3C0",PMC63, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX0C1",PMC64, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX1C1",PMC65, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX2C1",PMC66, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX3C1",PMC67, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX0C2",PMC68, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX1C2",PMC69, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX2C2",PMC70, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX3C2",PMC71, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX0C3",PMC72, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX1C3",PMC73, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX2C3",PMC74, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX3C3",PMC75, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOXFIX",PMC76, MBOXFIX, 0, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_UNC_MC_PMON_FIXED_CTL},
-    /* QPI counters four 48bit  wide per port, split in two reads */
-    {"SBOX0P0",PMC77, SBOX0, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0},
-    {"SBOX1P0",PMC78, SBOX0, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0},
-    {"SBOX2P0",PMC79, SBOX0, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0},
-    {"SBOX3P0",PMC80, SBOX0, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0},
-    {"SBOX0P1",PMC81, SBOX1, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1},
-    {"SBOX1P1",PMC82, SBOX1, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1},
-    {"SBOX2P1",PMC83, SBOX1, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1},
-    {"SBOX3P1",PMC84, SBOX1, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1}
+    {"MBOX0C0",PMC12, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_2},
+    {"MBOX0C1",PMC13, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_2},
+    {"MBOX0C2",PMC14, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_2},
+    {"MBOX0C3",PMC15, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_2},
+    {"MBOX0FIX",PMC16, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_2},
+    {"MBOX1C0",PMC17, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_3},
+    {"MBOX1C1",PMC18, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_3},
+    {"MBOX1C2",PMC19, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_3},
+    {"MBOX1C3",PMC20, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_3},
+    {"MBOX1FIX",PMC21, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_3},
+    {"MBOX2C0",PMC22, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_0},
+    {"MBOX2C1",PMC23, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_0},
+    {"MBOX2C2",PMC24, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_0},
+    {"MBOX2C3",PMC25, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_0},
+    {"MBOX2FIX",PMC26, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_0},
+    {"MBOX3C0",PMC27, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_1},
+    {"MBOX3C1",PMC28, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_1},
+    {"MBOX3C2",PMC29, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_1},
+    {"MBOX3C3",PMC30, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_1},
+    {"MBOX3FIX",PMC31, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_1},
 };
 
 
diff --git a/src/includes/perfmon_ivybridge_events.txt b/src/includes/perfmon_ivybridge_events.txt
index f2cb185..5318ce6 100644
--- a/src/includes/perfmon_ivybridge_events.txt
+++ b/src/includes/perfmon_ivybridge_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel Ivy Bridge
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
@@ -131,7 +131,9 @@ UMASK_LOAD_HIT_PRE_HW_PF               0x02
 
 EVENT_L1D                        0x51   PMC
 UMASK_L1D_REPLACEMENT             0x01
-UMASK_L1D_M_EVICT             0x04
+UMASK_L1D_ALLOCATED_IN_M          0x02
+UMASK_L1D_M_EVICT                 0x04
+UMASK_L1D_ALL_M_REPLACEMENT       0x08
 
 EVENT_MOVE_ELIMINATION                        0x58   PMC
 UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
@@ -173,7 +175,10 @@ UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24 0x00 0x04
 UMASK_IDQ_ALL_MITE_ALL_UOPS       0x3C
 
 EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HITS             0x01
 UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
 
 EVENT_ITLB_MISSES                 0x85      PMC
 UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
@@ -320,12 +325,17 @@ UMASK_MEM_UOP_RETIRED_STORES_LOCK             0x22
 UMASK_MEM_UOP_RETIRED_LOADS_SPLIT             0x41
 UMASK_MEM_UOP_RETIRED_STORES_SPLIT            0x42
 
-EVENT_MEMLOAD_UOPS_RETIRED               0xD1   PMC
-UMASK_MEMLOAD_UOPS_RETIRED_L1_HIT       0x01
-UMASK_MEMLOAD_UOPS_RETIRED_L2_HIT       0x02
-UMASK_MEMLOAD_UOPS_RETIRED_LLC_HIT      0x04
-UMASK_MEMLOAD_UOPS_RETIRED_LLC_MISS     0x20
-UMASK_MEMLOAD_UOPS_RETIRED_HIT_LFB      0x40
+EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
 
 EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED               0xD2   PMC
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS         0x01
@@ -368,178 +378,6 @@ UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM     0x0C
 UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM     0x10
 UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_FWD     0x20
 
-EVENT_CBO_CLOCKTICKS                         0x00  CBOX
-UMASK_CBO_CLOCKTICKS                         0x00
-
-EVENT_COUNTER0_OCCUPANCY              0x1F  CBOX
-UMASK_COUNTER0_OCCUPANCY              0x00
-
-EVENT_LLC_LOOKUP              0x34  CBOX0|CBOX1
-UMASK_LLC_LOOKUP_DATA_READ          0x03
-UMASK_LLC_LOOKUP_WRITE              0x05
-UMASK_LLC_LOOKUP_REMOTE_SNOOP       0x09
-UMASK_LLC_LOOKUP_ANY                0x11
-UMASK_LLC_LOOKUP_NID                0x41
-
-EVENT_LLC_VICTIMS              0x37  CBOX0|CBOX1
-UMASK_LLC_VICTIMS_M_STATE      0x01
-UMASK_LLC_VICTIMS_E_STATE      0x02
-UMASK_LLC_VICTIMS_S_STATE      0x04
-UMASK_LLC_VICTIMS_MISS         0x08
-UMASK_LLC_VICTIMS_NID          0x40
-
-EVENT_CBO_MISC              0x39  CBOX0|CBOX1
-UMASK_CBO_MISC_RSPI_WAS_FSE      0x01
-UMASK_CBO_MISC_WC_ALIASING       0x02
-UMASK_CBO_MISC_STARTED           0x04
-UMASK_CBO_MISC_RFO_HIT_S         0x08
-
-EVENT_RING_AD_USED               0x1B  CBOX2|CBOX3
-UMASK_RING_AD_USED_0_UP_EVEN      0x01
-UMASK_RING_AD_USED_0_UP_ODD       0x02
-UMASK_RING_AD_USED_0_DOWN_EVEN    0x04
-UMASK_RING_AD_USED_0_DOWN_ODD     0x08
-UMASK_RING_AD_USED_1_UP_EVEN      0x10
-UMASK_RING_AD_USED_1_UP_ODD       0x20
-UMASK_RING_AD_USED_1_DOWN_EVEN    0x40
-UMASK_RING_AD_USED_1_DOWN_ODD     0x80
-UMASK_RING_AD_USED_DOWN           0xCC
-UMASK_RING_AD_USED_UP             0x33
-
-EVENT_RING_AK_USED              0x1C  CBOX2|CBOX3
-UMASK_RING_AK_USED_0_UP_EVEN      0x01
-UMASK_RING_AK_USED_0_UP_ODD       0x02
-UMASK_RING_AK_USED_0_DOWN_EVEN    0x04
-UMASK_RING_AK_USED_0_DOWN_ODD     0x08
-UMASK_RING_AK_USED_1_UP_EVEN      0x10
-UMASK_RING_AK_USED_1_UP_ODD       0x20
-UMASK_RING_AK_USED_1_DOWN_EVEN    0x40
-UMASK_RING_AK_USED_1_DOWN_ODD     0x80
-UMASK_RING_AK_USED_DOWN           0xCC
-UMASK_RING_AK_USED_UP             0x33
-
-EVENT_RING_BL_USED              0x1D  CBOX2|CBOX3
-UMASK_RING_BL_USED_0_UP_EVEN      0x01
-UMASK_RING_BL_USED_0_UP_ODD       0x02
-UMASK_RING_BL_USED_0_DOWN_EVEN    0x04
-UMASK_RING_BL_USED_0_DOWN_ODD     0x08
-UMASK_RING_BL_USED_1_UP_EVEN      0x10
-UMASK_RING_BL_USED_1_UP_ODD       0x20
-UMASK_RING_BL_USED_1_DOWN_EVEN    0x40
-UMASK_RING_BL_USED_1_DOWN_ODD     0x80
-UMASK_RING_BL_USED_DOWN           0xCC
-UMASK_RING_BL_USED_UP             0x33
-
-EVENT_RING_BOUNCES              0x05  CBOX0|CBOX1
-UMASK_RING_BOUNCES_AK_IRQ       0x02
-UMASK_RING_BOUNCES_AK_CORE      0x04
-UMASK_RING_BOUNCES_BL_CORE      0x08
-UMASK_RING_BOUNCES_IV_CORE      0x01
-
-EVENT_RING_IV_USED              0x1E  CBOX2|CBOX3
-UMASK_RING_IV_USED_ANY           0x0F
-UMASK_RING_IV_USED_UP            0x33
-UMASK_RING_IV_USED_DOWN          0xCC
-
-EVENT_RING_SRC_THRTL            0x07  CBOX0|CBOX1
-UMASK_RING_SRC_THRTL            0x00
-
-EVENT_RXR_EXT_STARVED               0x12  CBOX0|CBOX1
-UMASK_RXR_EXT_STARVED_IRQ           0x01
-UMASK_RXR_EXT_STARVED_IPQ           0x02
-UMASK_RXR_EXT_STARVED_PRQ           0x04
-UMASK_RXR_EXT_STARVED_ISMQ_BIDS     0x08
-
-EVENT_RXR_INSERTS                0x13  CBOX0|CBOX1
-UMASK_RXR_INSERTS_IRQ            0x01
-UMASK_RXR_INSERTS_IRQ_REJECTED   0x02
-UMASK_RXR_INSERTS_IPQ            0x04
-UMASK_RXR_INSERTS_VFIFO          0x10
-
-EVENT_RXR_IPQ_RETRY                0x31  CBOX0|CBOX1
-UMASK_RXR_IPQ_RETRY_ANY            0x01
-UMASK_RXR_IPQ_RETRY_FULL           0x02
-UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT  0x04
-UMASK_RXR_IPQ_RETRY_QPI_CREDITS    0x10
-
-EVENT_RXR_IRQ_RETRY                0x32  CBOX0|CBOX1
-UMASK_RXR_IRQ_RETRY_ANY            0x01
-UMASK_RXR_IRQ_RETRY_FULL           0x02
-UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT  0x04
-UMASK_RXR_IRQ_RETRY_RTID           0x08
-UMASK_RXR_IRQ_RETRY_QPI_CREDITS    0x10
-UMASK_RXR_IRQ_RETRY_HO_CREDITS     0x20
-
-EVENT_RXR_ISMQ_RETRY                0x33  CBOX0|CBOX1
-UMASK_RXR_ISMQ_RETRY_ANY            0x01
-UMASK_RXR_ISMQ_RETRY_FULL           0x02
-UMASK_RXR_ISMQ_RETRY_RTID           0x08
-UMASK_RXR_ISMQ_RETRY_QPI_CREDITS    0x10
-UMASK_RXR_ISMQ_RETRY_HO_CREDITS     0x20
-UMASK_RXR_ISMQ_RETRY_WB_CREDITS     0x80
-
-EVENT_RXR_OCCUPANCY                0x11  CBOX0
-UMASK_RXR_OCCUPANCY_IRQ            0x01
-UMASK_RXR_OCCUPANCY_IRQ_REJECTED   0x02
-UMASK_RXR_OCCUPANCY_IPQ            0x04
-UMASK_RXR_OCCUPANCY_VIFO           0x10
-
-EVENT_TOR_INSERTS                    0x35  CBOX0|CBOX1
-UMASK_TOR_INSERTS_OPCODE             0x01
-UMASK_TOR_INSERTS_MISS_OPCODE        0x03
-UMASK_TOR_INSERTS_EVICTION           0x04
-UMASK_TOR_INSERTS_ALL                0x08
-UMASK_TOR_INSERTS_WB                 0x10
-UMASK_TOR_INSERTS_MISS_ALL           0x0A
-UMASK_TOR_INSERTS_MISS_LOCAL         0x2A
-UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE  0x23
-UMASK_TOR_INSERTS_NID_OPCODE         0x41
-UMASK_TOR_INSERTS_NID_EVICTION       0x44
-UMASK_TOR_INSERTS_NID_ALL            0x48
-UMASK_TOR_INSERTS_NID_WB             0x50
-UMASK_TOR_INSERTS_NID_MISS_OPCODE    0x43
-UMASK_TOR_INSERTS_NID_MISS_ALL       0x4A
-UMASK_TOR_INSERTS_REMOTE_OPCODE      0x81
-UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE   0x83
-UMASK_TOR_INSERTS_REMOTE             0x88
-UMASK_TOR_INSERTS_MISS_REMOTE        0x8A
-
-EVENT_TOR_OCCUPANCY                    0x36  CBOX0
-UMASK_TOR_OCCUPANCY_OPCODE             0x01
-UMASK_TOR_OCCUPANCY_MISS_OPCODE        0x03
-UMASK_TOR_OCCUPANCY_EVICTION           0x04
-UMASK_TOR_OCCUPANCY_ALL                0x08
-UMASK_TOR_OCCUPANCY_MISS_ALL           0x0A
-UMASK_TOR_OCCUPANCY_WB                 0x10
-UMASK_TOR_OCCUPANCY_LOCAL_OPCODE       0x21
-UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE  0x23
-UMASK_TOR_OCCUPANCY_LOCAL              0x28
-UMASK_TOR_OCCUPANCY_MISS_LOCAL         0x2A
-UMASK_TOR_OCCUPANCY_NID_OPCODE         0x41
-UMASK_TOR_OCCUPANCY_NID_EVICTION       0x44
-UMASK_TOR_OCCUPANCY_NID_ALL            0x48
-UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE    0x43
-UMASK_TOR_OCCUPANCY_NID_MISS_ALL       0x4A
-UMASK_TOR_OCCUPANCY_NID_WB             0x50
-UMASK_TOR_OCCUPANCY_REMOTE_OPCODE       0x81
-UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE  0x83
-UMASK_TOR_OCCUPANCY_REMOTE              0x88
-UMASK_TOR_OCCUPANCY_MISS_REMOTE         0x8A
-
-EVENT_TXR_ADS_USED                0x04  CBOX0|CBOX1
-UMASK_TXR_ADS_USED_AD            0x01
-UMASK_TXR_ADS_USED_AK            0x02
-UMASK_TXR_ADS_USED_BL            0x04
-
-EVENT_TXR_INSERTS                0x02  CBOX0|CBOX1
-UMASK_TXR_INSERTS_AD_CACHE            0x01
-UMASK_TXR_INSERTS_AK_CACHE            0x02
-UMASK_TXR_INSERTS_BL_CACHE            0x04
-UMASK_TXR_INSERTS_IV_CACHE            0x08
-UMASK_TXR_INSERTS_AD_CORE             0x10
-UMASK_TXR_INSERTS_AK_CORE             0x20
-UMASK_TXR_INSERTS_BL_CORE             0x40
-
 EVENT_DRAM_CLOCKTICKS             0x00  MBOX
 UMASK_DRAM_CLOCKTICKS             0x00
 
@@ -821,13 +659,3 @@ UMASK_WR_CAS_RANK7_BANK4           0x10
 UMASK_WR_CAS_RANK7_BANK5           0x20
 UMASK_WR_CAS_RANK7_BANK6           0x40
 UMASK_WR_CAS_RANK7_BANK7           0x80
-
-
-
-
-
-
-
-
-
-
diff --git a/src/includes/perfmon_k10.h b/src/includes/perfmon_k10.h
index 45274fd..cc614af 100644
--- a/src/includes/perfmon_k10.h
+++ b/src/includes/perfmon_k10.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header file of perfmon module for K10
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -46,12 +46,12 @@ void perfmon_init_k10(PerfmonThread *thread)
     msr_write(cpu_id, MSR_AMD_PERFEVTSEL2, 0x0ULL);
     msr_write(cpu_id, MSR_AMD_PERFEVTSEL3, 0x0ULL);
 
-    flags |= (1<<16);  /* user mode flag */
+    //flags |= (1<<16);  /* user mode flag */
 
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL0, flags);
+    /*msr_write(cpu_id, MSR_AMD_PERFEVTSEL0, flags);
     msr_write(cpu_id, MSR_AMD_PERFEVTSEL1, flags);
     msr_write(cpu_id, MSR_AMD_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL3, flags);
+    msr_write(cpu_id, MSR_AMD_PERFEVTSEL3, flags);*/
 }
 
 
@@ -65,8 +65,7 @@ void perfmon_setupCounterThread_k10(
     int cpu_id = perfmon_threadData[thread_id].processorId;
     perfmon_threadData[thread_id].counters[index].init = TRUE;
 
-    flags = msr_read(cpu_id,reg);
-    flags &= ~(0xFFFFU); 
+    flags |= (1<<16);
 
     /* AMD uses a 12 bit Event mask: [35:32][7:0] */
     flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
diff --git a/src/includes/perfmon_k10_counters.h b/src/includes/perfmon_k10_counters.h
index e07c23a..d01be3d 100644
--- a/src/includes/perfmon_k10_counters.h
+++ b/src/includes/perfmon_k10_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description:  AMD K10 specific subroutines
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_k10_events.txt b/src/includes/perfmon_k10_events.txt
index c4a89e2..64c20e9 100644
--- a/src/includes/perfmon_k10_events.txt
+++ b/src/includes/perfmon_k10_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for AMD K10
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_k8.h b/src/includes/perfmon_k8.h
index 2f393f8..9313168 100644
--- a/src/includes/perfmon_k8.h
+++ b/src/includes/perfmon_k8.h
@@ -7,8 +7,8 @@
  *                    Configures and reads out performance counters
  *                    on x86 based architectures. Supports multi threading.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_k8_events.txt b/src/includes/perfmon_k8_events.txt
index 7f93db2..127b56f 100644
--- a/src/includes/perfmon_k8_events.txt
+++ b/src/includes/perfmon_k8_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for AMD K8
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_kabini.h b/src/includes/perfmon_kabini.h
index 476636a..018eb04 100644
--- a/src/includes/perfmon_kabini.h
+++ b/src/includes/perfmon_kabini.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header file of perfmon module for AMD Family16
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -57,11 +57,11 @@ void perfmon_init_kabini(PerfmonThread *thread)
         msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL3, 0x0ULL);
     }
 
-    flags |= (1<<16);  /* user mode flag */
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL0, flags);
+    //flags |= (1<<16);  /* user mode flag */
+    /*msr_write(cpu_id, MSR_AMD16_PERFEVTSEL0, flags);
     msr_write(cpu_id, MSR_AMD16_PERFEVTSEL1, flags);
     msr_write(cpu_id, MSR_AMD16_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL3, flags);
+    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL3, flags);*/
 }
 
 
@@ -70,7 +70,7 @@ void perfmon_setupCounterThread_kabini(
         PerfmonEvent* event,
         PerfmonCounterIndex index)
 {
-    uint64_t flags;
+    uint64_t flags = 0x0ULL;
     uint64_t reg = kabini_counter_map[index].configRegister;
     int cpu_id = perfmon_threadData[thread_id].processorId;
     perfmon_threadData[thread_id].counters[index].init = TRUE;
@@ -82,8 +82,10 @@ void perfmon_setupCounterThread_kabini(
         return;
     }
 
-    flags = msr_read(cpu_id,reg);
-    flags &= ~(0xFFFFU); 
+    if (kabini_counter_map[index].type == PMC)
+    {
+        flags |= (1<<16);
+    }
 
     /* AMD uses a 12 bit Event mask: [35:32][7:0] */
     flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
diff --git a/src/includes/perfmon_kabini_counters.h b/src/includes/perfmon_kabini_counters.h
index 9cea474..8662522 100644
--- a/src/includes/perfmon_kabini_counters.h
+++ b/src/includes/perfmon_kabini_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Counter Header File of perfmon module for AMD Family16
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_kabini_events.txt b/src/includes/perfmon_kabini_events.txt
index 4f28024..9ccc726 100644
--- a/src/includes/perfmon_kabini_events.txt
+++ b/src/includes/perfmon_kabini_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for AMD Kabini
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author: saravanan.ekanathan at amd.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_nehalem.h b/src/includes/perfmon_nehalem.h
index 99f2422..b3e7907 100644
--- a/src/includes/perfmon_nehalem.h
+++ b/src/includes/perfmon_nehalem.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of perfmon module for Nehalem.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -64,17 +64,17 @@ void perfmon_init_nehalem(PerfmonThread *thread)
      * FIXED 0: Instructions retired
      * FIXED 1: Clocks unhalted core
      * FIXED 2: Clocks unhalted ref */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
+    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
 
     //    flags |= (1<<22);  /* enable flag */
     //    flags |= (1<<16);  /* user mode flag */
-    setBit(flags,16); /* set user mode flag */
-    setBit(flags,22); /* set enable flag */
+    //setBit(flags,16); /* set user mode flag */
+    //setBit(flags,22); /* set enable flag */
 
-    msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
+    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
     msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
     msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);
+    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
 
 
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
@@ -105,19 +105,19 @@ void perfmon_init_nehalem(PerfmonThread *thread)
         msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
         msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
         msr_write(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL);
-        msr_write(cpu_id, MSR_OFFCORE_RSP0, 0x0ULL);
+        msr_write(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL);
 
         /* Preinit of PERFEVSEL registers */
-        clearBit(flags,16); /* set enable flag */
+        //clearBit(flags,16); /* set enable flag */
 
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL0, flags);
+        /*msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL0, flags);
         msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL1, flags);
         msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL2, flags);
         msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL3, flags);
         msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL4, flags);
         msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL5, flags);
         msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL6, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL7, flags);
+        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL7, flags);*/
     }
 }
 
@@ -128,20 +128,21 @@ void perfmon_setupCounterThread_nehalem(
         PerfmonCounterIndex index)
 {
     int haveLock = 0;
-    uint64_t flags;
+    uint64_t flags = 0x0ULL;
     uint64_t reg = nehalem_counter_map[index].configRegister;
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
 
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
     {
         haveLock = 1;
     }
 
+    perfmon_threadData[thread_id].counters[index].init = TRUE;
+
     if ( nehalem_counter_map[index].type == PMC )
     {
-        perfmon_threadData[thread_id].counters[index].init = TRUE;
-        flags = msr_read(cpu_id,reg);
-        flags &= ~(0xFFFFU);  /* clear lower 16bits */
+        flags = (1<<16)|(1<<22);
 
         /* Intel with standard 8 bit event mask: [7:0] */
         flags |= (event->umask<<8) + event->eventId;
@@ -166,9 +167,7 @@ void perfmon_setupCounterThread_nehalem(
     {
         if(haveLock)
         {
-            perfmon_threadData[thread_id].counters[index].init = TRUE;
-            flags = msr_read(cpu_id,reg);
-            flags &= ~(0xFFFFU);  /* clear lower 16bits */
+            flags = (1<<22);
 
             /* Intel with standard 8 bit event mask: [7:0] */
             flags |= (event->umask<<8) + event->eventId;
@@ -193,7 +192,8 @@ void perfmon_setupCounterThread_nehalem(
     }
     else if (nehalem_counter_map[index].type == FIXED)
     {
-        perfmon_threadData[thread_id].counters[index].init = TRUE;
+        fixed_flags |= (0x2 <<(index*4));
+        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
     }
 }
 
@@ -214,7 +214,7 @@ void perfmon_startCountersThread_nehalem(int thread_id)
         uflags = 0x100000000ULL;
     }
 
-    for ( int i=0; i<NUM_PMC; i++ ) 
+    for ( int i=0; i<NUM_PMC; i++ )
     {
         if (perfmon_threadData[thread_id].counters[i].init == TRUE)
         {
@@ -287,11 +287,11 @@ void perfmon_stopCountersThread_nehalem(int thread_id)
     }
 
     flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    printf ("Status: 0x%llX \n", LLU_CAST flags);
 
-    if((flags & 0x3) || (flags & (0x3ULL<<32)) ) 
+    if((flags & 0x3) || (flags & (0x3ULL<<32)) )
     {
         printf ("Overflow occured \n");
+        printf ("Status: 0x%llX \n", LLU_CAST flags);
     }
 }
 
diff --git a/src/includes/perfmon_nehalemEX.h b/src/includes/perfmon_nehalemEX.h
index 84457ae..ea632cf 100644
--- a/src/includes/perfmon_nehalemEX.h
+++ b/src/includes/perfmon_nehalemEX.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of perfmon module for Nehalem EX.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -38,12 +38,299 @@ static int perfmon_numGroupsNehalemEX = NUM_GROUPS_NEHALEMEX;
 static int perfmon_numArchEventsNehalemEX = NUM_ARCH_EVENTS_NEHALEMEX;
 
 /* This SUCKS: There are only subtle difference between NehalemEX
- * and Westmere EX Uncore. Still one of them is that one field is 
- * 1 bit shifted. Thank you Intel for this mess!!! Do you want 
+ * and Westmere EX Uncore. Still one of them is that one field is
+ * 1 bit shifted. Thank you Intel for this mess!!! Do you want
  * to change the register definitions for every architecture?*/
 
-/* MBOX macros */
 
+void perfmon_init_nehalemEX(PerfmonThread *thread)
+{   /* Clears the core PMU registers of thread->processorId, assigns per-box counter ids and box registers, and zeroes the uncore if this CPU holds or acquires the socket lock. */
+    uint64_t flags = 0x0ULL;
+    int cpu_id = thread->processorId;
+    perfmon_verbose = 1; /* NOTE(review): forces verbose output on every init -- looks like a debugging leftover, confirm */
+    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL); /* zero core control, event-select and counter registers */
+    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
+    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
+    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
+    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
+    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
+    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
+    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
+    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
+    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
+    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
+    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
+    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
+    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
+
+    /* initialize fixed counters (disabled here: the write below is commented out)
+     * FIXED 0: Instructions retired
+     * FIXED 1: Clocks unhalted core
+     * FIXED 2: Clocks unhalted ref */
+    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
+
+    /* Preinit of PERFEVSEL registers (disabled here: the writes below are commented out) */
+    //flags |= (1<<22);  /* enable flag */
+    //flags |= (1<<16);  /* user mode flag */
+
+    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
+    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
+    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
+    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
+
+    /* Initialize uncore: each counter gets an id that restarts at 0 within its box */
+    /* MBOX */
+    thread->counters[PMC7].id  = 0;
+    thread->counters[PMC8].id  = 1;
+    thread->counters[PMC9].id  = 2;
+    thread->counters[PMC10].id = 3;
+    thread->counters[PMC11].id = 4;
+    thread->counters[PMC12].id = 5;
+    westmereEX_PMunits[MBOX0].ctrlRegister = MSR_M0_PMON_BOX_CTRL;
+    westmereEX_PMunits[MBOX0].statusRegister = MSR_M0_PMON_BOX_STATUS;
+    westmereEX_PMunits[MBOX0].ovflRegister = MSR_M0_PMON_BOX_OVF_CTRL;
+
+    thread->counters[PMC13].id = 0;
+    thread->counters[PMC14].id = 1;
+    thread->counters[PMC15].id = 2;
+    thread->counters[PMC16].id = 3;
+    thread->counters[PMC17].id = 4;
+    thread->counters[PMC18].id = 5;
+    westmereEX_PMunits[MBOX1].ctrlRegister = MSR_M1_PMON_BOX_CTRL;
+    westmereEX_PMunits[MBOX1].statusRegister = MSR_M1_PMON_BOX_STATUS;
+    westmereEX_PMunits[MBOX1].ovflRegister = MSR_M1_PMON_BOX_OVF_CTRL;
+
+    /* BBOX */
+    thread->counters[PMC19].id = 0;
+    thread->counters[PMC20].id = 1;
+    thread->counters[PMC21].id = 2;
+    thread->counters[PMC22].id = 3;
+    westmereEX_PMunits[BBOX0].ctrlRegister = MSR_B0_PMON_BOX_CTRL;
+    westmereEX_PMunits[BBOX0].statusRegister =  MSR_B0_PMON_BOX_STATUS;
+    westmereEX_PMunits[BBOX0].ovflRegister = MSR_B0_PMON_BOX_OVF_CTRL;
+
+    thread->counters[PMC23].id = 0;
+    thread->counters[PMC24].id = 1;
+    thread->counters[PMC25].id = 2;
+    thread->counters[PMC26].id = 3;
+    westmereEX_PMunits[BBOX1].ctrlRegister = MSR_B1_PMON_BOX_CTRL;
+    westmereEX_PMunits[BBOX1].statusRegister =  MSR_B1_PMON_BOX_STATUS;
+    westmereEX_PMunits[BBOX1].ovflRegister = MSR_B1_PMON_BOX_OVF_CTRL;
+
+    /* RBOX */
+    thread->counters[PMC27].id = 0;
+    thread->counters[PMC28].id = 1;
+    thread->counters[PMC29].id = 2;
+    thread->counters[PMC30].id = 3;
+    thread->counters[PMC31].id = 4;
+    thread->counters[PMC32].id = 5;
+    thread->counters[PMC33].id = 6;
+    thread->counters[PMC34].id = 7;
+    westmereEX_PMunits[RBOX0].ctrlRegister = MSR_R0_PMON_BOX_CTRL;
+    westmereEX_PMunits[RBOX0].statusRegister =  MSR_R0_PMON_BOX_STATUS;
+    westmereEX_PMunits[RBOX0].ovflRegister = MSR_R0_PMON_BOX_OVF_CTRL;
+
+    thread->counters[PMC35].id = 0;
+    thread->counters[PMC36].id = 1;
+    thread->counters[PMC37].id = 2;
+    thread->counters[PMC38].id = 3;
+    thread->counters[PMC39].id = 4;
+    thread->counters[PMC40].id = 5;
+    thread->counters[PMC41].id = 6;
+    thread->counters[PMC42].id = 7;
+    westmereEX_PMunits[RBOX1].ctrlRegister = MSR_R1_PMON_BOX_CTRL;
+    westmereEX_PMunits[RBOX1].statusRegister =  MSR_R1_PMON_BOX_STATUS;
+    westmereEX_PMunits[RBOX1].ovflRegister = MSR_R1_PMON_BOX_OVF_CTRL;
+
+    /* WBOX */
+    thread->counters[PMC43].id = 0;
+    thread->counters[PMC44].id = 1;
+    thread->counters[PMC45].id = 2;
+    thread->counters[PMC46].id = 3;
+    thread->counters[PMC47].id = 31; /* the only id that is not a small consecutive index */
+    westmereEX_PMunits[WBOX].ctrlRegister   = MSR_W_PMON_BOX_CTRL;
+    westmereEX_PMunits[WBOX].statusRegister = MSR_W_PMON_BOX_STATUS;
+    westmereEX_PMunits[WBOX].ovflRegister   = MSR_W_PMON_BOX_OVF_CTRL;
+
+    thread->counters[PMC48].id = 0; /* UBOX */
+    westmereEX_PMunits[UBOX].ctrlRegister   = MSR_U_PMON_GLOBAL_CTRL;
+    westmereEX_PMunits[UBOX].statusRegister = MSR_U_PMON_GLOBAL_STATUS;
+    westmereEX_PMunits[UBOX].ovflRegister   = MSR_U_PMON_GLOBAL_OVF_CTRL;
+
+    /* Set IDs for all CBOXes: every group of five consecutive counters gets ids 0..4 */
+    for (int i=PMC49; i<=PMC88; i+= 5)
+    {
+        for(int j=0; j<5; j++)
+        {
+            thread->counters[i+j].id = j; /* index i+j: indexing with i alone left four of five ids unset */
+        }
+    }
+    westmereEX_PMunits[CBOX0].ctrlRegister   = MSR_C0_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX0].statusRegister = MSR_C0_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX0].ovflRegister   = MSR_C0_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX1].ctrlRegister   = MSR_C1_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX1].statusRegister = MSR_C1_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX1].ovflRegister   = MSR_C1_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX2].ctrlRegister   = MSR_C2_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX2].statusRegister = MSR_C2_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX2].ovflRegister   = MSR_C2_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX3].ctrlRegister   = MSR_C3_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX3].statusRegister = MSR_C3_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX3].ovflRegister   = MSR_C3_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX4].ctrlRegister   = MSR_C4_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX4].statusRegister = MSR_C4_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX4].ovflRegister   = MSR_C4_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX5].ctrlRegister   = MSR_C5_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX5].statusRegister = MSR_C5_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX5].ovflRegister   = MSR_C5_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX6].ctrlRegister   = MSR_C6_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX6].statusRegister = MSR_C6_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX6].ovflRegister   = MSR_C6_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX7].ctrlRegister   = MSR_C7_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX7].statusRegister = MSR_C7_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX7].ovflRegister   = MSR_C7_PMON_BOX_OVF_CTRL;
+
+    thread->counters[PMC99].id = 0; /* SBOX0 */
+    thread->counters[PMC100].id = 1;
+    thread->counters[PMC101].id = 2;
+    thread->counters[PMC102].id = 3;
+    westmereEX_PMunits[SBOX0].ctrlRegister   = MSR_S0_PMON_BOX_CTRL;
+    westmereEX_PMunits[SBOX0].statusRegister = MSR_S0_PMON_BOX_STATUS;
+    westmereEX_PMunits[SBOX0].ovflRegister   = MSR_S0_PMON_BOX_OVF_CTRL;
+    thread->counters[PMC103].id = 0; /* SBOX1 */
+    thread->counters[PMC104].id = 1;
+    thread->counters[PMC105].id = 2;
+    thread->counters[PMC106].id = 3;
+    westmereEX_PMunits[SBOX1].ctrlRegister   = MSR_S1_PMON_BOX_CTRL;
+    westmereEX_PMunits[SBOX1].statusRegister = MSR_S1_PMON_BOX_STATUS;
+    westmereEX_PMunits[SBOX1].ovflRegister   = MSR_S1_PMON_BOX_OVF_CTRL;
+
+    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
+            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
+    {   /* only the CPU that owns (or just acquired) the socket lock zeroes the uncore registers */
+        msr_write(cpu_id, MSR_W_PMON_BOX_CTRL,  0x0ULL);
+        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_W_PMON_FIXED_CTR, 0x0ULL);
+
+        msr_write(cpu_id, MSR_M0_PMON_BOX_CTRL,  0x0ULL);
+        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL4, 0x0ULL);
+        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL5, 0x0ULL);
+
+        msr_write(cpu_id, MSR_M1_PMON_BOX_CTRL,  0x0ULL);
+        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL4, 0x0ULL);
+        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL5, 0x0ULL);
+
+        msr_write(cpu_id, MSR_B0_PMON_BOX_CTRL,  0x0ULL);
+        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL3, 0x0ULL);
+
+        msr_write(cpu_id, MSR_B1_PMON_BOX_CTRL,  0x0ULL);
+        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL3, 0x0ULL);
+
+        msr_write(cpu_id, MSR_R0_PMON_BOX_CTRL,  0x0ULL);
+        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL4, 0x0ULL);
+        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL5, 0x0ULL);
+        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL6, 0x0ULL);
+        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL7, 0x0ULL);
+
+        msr_write(cpu_id, MSR_R1_PMON_BOX_CTRL,   0x0ULL);
+        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL8,  0x0ULL);
+        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL9,  0x0ULL);
+        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL10, 0x0ULL);
+        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL11, 0x0ULL);
+        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL12, 0x0ULL);
+        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL13, 0x0ULL);
+        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL14, 0x0ULL);
+        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL15, 0x0ULL);
+
+        msr_write(cpu_id, MSR_U_PMON_GLOBAL_EVNT_SEL, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL3, 0x0ULL);
+
+        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL3, 0x0ULL);
+
+        flags = 0x0UL;
+        flags |= (1<<29); /* reset all */
+        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, flags );
+    }
+}
+
+/* MBOX macros */
 #define MBOX_GATE_NEHEX(NUM)  \
 flags = 0x41ULL; \
 switch (event->cfgBits)  \
@@ -249,16 +536,22 @@ void perfmon_setupCounterThread_nehalemEX(
         PerfmonEvent* event,
         PerfmonCounterIndex index)
 {
-    uint64_t flags = 0x0ULL;;
-    uint64_t reg = westmereEX_counter_map[index].configRegister;
+    uint64_t flags = 0x0ULL;
+    int haveLock = 0;
+    uint64_t reg = counter_map[index].configRegister;
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
     perfmon_threadData[thread_id].counters[index].init = TRUE;
 
-    switch (westmereEX_counter_map[index].type)
+    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    {
+        haveLock = 1;
+    }
+
+    switch (counter_map[index].type)
     {
         case PMC:
-            flags = msr_read(cpu_id,reg);
-            flags &= ~(0xFFFFU);   /* clear lower 16bits */
+            flags = (1<<22)|(1<<16);
 
             /* Intel with standard 8 bit event mask: [7:0] */
             flags |= (event->umask<<8) + event->eventId;
@@ -274,53 +567,115 @@ void perfmon_setupCounterThread_nehalemEX(
             break;
 
         case FIXED:
+            fixed_flags |= (0x2<<(index*4));
+            msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
             break;
 
         case MBOX0:
-            MBOX_GATE_NEHEX(0);
-            msr_write(cpu_id, reg , flags);
-            VERBOSEPRINTREG(cpu_id, reg, flags, MBOX0_CTRL)
+            if (haveLock)
+            {
+                MBOX_GATE_NEHEX(0);
+                msr_write(cpu_id, reg , flags);
+                VERBOSEPRINTREG(cpu_id, reg, flags, MBOX0_CTRL)
+            }
             break;
 
         case MBOX1:
-            MBOX_GATE_NEHEX(1);
-            msr_write(cpu_id, reg , flags);
-            VERBOSEPRINTREG(cpu_id, reg, flags, MBOX1_CTRL)
+            if (haveLock)
+            {
+                MBOX_GATE_NEHEX(1);
+                msr_write(cpu_id, reg , flags);
+                VERBOSEPRINTREG(cpu_id, reg, flags, MBOX1_CTRL)
+            }
             break;
 
         case BBOX0:
-
         case BBOX1:
-            flags = 0x1ULL; /* set enable bit */
-            flags |=  (event->eventId<<1);
-            msr_write(cpu_id, reg , flags);
-            VERBOSEPRINTREG(cpu_id, reg, flags, BBOX_CTRL)
+            if (haveLock)
+            {
+                flags = 0x1ULL; /* set enable bit */
+                flags |=  (event->eventId<<1);
+                msr_write(cpu_id, reg , flags);
+                VERBOSEPRINTREG(cpu_id, reg, flags, BBOX_CTRL)
+            }
             break;
 
         case RBOX0:
-            RBOX_GATE(0);
-            msr_write(cpu_id, reg , flags);
-            VERBOSEPRINTREG(cpu_id, reg, flags, RBOX0_CTRL)
+            if (haveLock)
+            {
+                RBOX_GATE(0);
+                msr_write(cpu_id, reg , flags);
+                VERBOSEPRINTREG(cpu_id, reg, flags, RBOX0_CTRL)
+            }
             break;
 
         case RBOX1:
-            RBOX_GATE(1);
-            msr_write(cpu_id, reg , flags);
-            VERBOSEPRINTREG(cpu_id, reg, flags, RBOX1_CTRL)
+            if (haveLock)
+            {
+                RBOX_GATE(1);
+                msr_write(cpu_id, reg , flags);
+                VERBOSEPRINTREG(cpu_id, reg, flags, RBOX1_CTRL)
+            }
             break;
 
         case WBOX:
-            if (event->eventId == 0xFF)  /* Fixed Counter */
+            if (haveLock)
             {
-                flags = 0x1ULL; /* set enable bit */
+                if (event->eventId == 0xFF)  /* Fixed Counter */
+                {
+                    flags = 0x1ULL; /* set enable bit */
+                }
+                else
+                {
+                    flags |= (1<<22); /* set enable bit */
+                    flags |= (event->umask<<8) + event->eventId;
+                }
+                msr_write(cpu_id, reg , flags);
+                VERBOSEPRINTREG(cpu_id, reg, flags, WBOX_CTRL)
             }
-            else
+            break;
+
+        case UBOX:
+            if (haveLock)
+            {
+                flags = 0x0ULL;
+                flags |= (1<<22);
+                flags |= event->eventId;
+                fprintf(stderr, "Setup UBOX with value 0x%llx in register 0x%llx, event 0x%x \n", LLU_CAST flags, LLU_CAST reg,event->eventId);
+                msr_write(cpu_id, reg , flags);
+                VERBOSEPRINTREG(cpu_id, reg, flags, UBOX_CTRL)
+            }
+            break;
+
+        case CBOX0:
+        case CBOX1:
+        case CBOX2:
+        case CBOX3:
+        case CBOX4:
+        case CBOX5:
+        case CBOX6:
+        case CBOX7:
+            if (haveLock)
             {
-                flags |= (1<<22); /* set enable bit */
+                flags = 0x0ULL;
+                flags |= (1<<22);
                 flags |= (event->umask<<8) + event->eventId;
+                fprintf(stderr, "Setup CBOX with value 0x%llx in register 0x%llx, event 0x%x umask 0x%x \n", LLU_CAST flags, LLU_CAST reg,event->eventId, event->umask);
+                msr_write(cpu_id, reg , flags);
+                VERBOSEPRINTREG(cpu_id, reg, flags, CBOX_CTRL)
+            }
+            break;
+        case SBOX0:
+        case SBOX1:
+            if (haveLock)
+            {
+                flags = 0x0ULL;
+                flags |= (1<<22);
+                flags |= (event->umask<<8);
+                flags |= (event->eventId);
+                msr_write(cpu_id, reg , flags);
+                VERBOSEPRINTREG(cpu_id, reg, flags, SBOX_CTRL)
             }
-            msr_write(cpu_id, reg , flags);
-            VERBOSEPRINTREG(cpu_id, reg, flags, WBOX_CTRL)
             break;
 
         default:
@@ -329,3 +684,184 @@ void perfmon_setupCounterThread_nehalemEX(
     }
 }
 
+
+/* Actions for Performance Monitoring Session:
+ *
+ * Core Counters (counter is always enabled in PERFEVTSEL register):
+ * 1) Disable counters in global ctrl register MSR_PERF_GLOBAL_CTRL
+ * 2) Zero the corresponding counter registers
+ * 3) Set enable bit in global register flag
+ * 4) Write global register flag
+ *
+ * Uncore Counters (only one core per socket):
+ * 1) Set reset flag in global U Box control register
+ * 2) Zero the corresponding counter registers
+ * 3) Set enable bit in Box control register
+ * 4) Write the corresponding uncore Box ctrl register
+ * 5) Set enable bit in global U Box control register
+ * */
+
+void perfmon_startCountersThread_nehalemEX(int thread_id)
+{   /* Zeroes and enables every counter of thread_id whose init flag is TRUE; uncore counters only on the CPU holding the socket lock. */
+    int haveLock = 0;
+    uint64_t flags = 0x0ULL;      /* enable mask written to MSR_PERF_GLOBAL_CTRL at the end */
+    uint32_t uflags[NUM_UNITS];   /* per-unit enable masks, indexed by counter type -- NOTE(review): assumes every uncore type value is < NUM_UNITS, confirm */
+    int enable_ubox = 0;
+    int cpu_id = perfmon_threadData[thread_id].processorId;
+
+    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        uint32_t ubflags = 0x0UL; /* currently unused: the reset write below is commented out */
+        ubflags |= (1<<29); /* reset all */
+        haveLock = 1;
+        //        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
+        //       VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags, UBOX_GLOBAL_CTRL)
+    }
+
+    for ( int i=0; i<NUM_UNITS; i++ )
+    {
+        uflags[i] = 0x0UL;
+    }
+
+    for ( int i=0; i<NUM_PMC; i++ )
+    {
+        if (perfmon_threadData[thread_id].counters[i].init == TRUE) {
+            if (westmereEX_counter_map[i].type == PMC)
+            {
+                msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
+                flags |= (1ULL<<(i-OFFSET_PMC));  /* enable counter */
+            }
+            else if (westmereEX_counter_map[i].type == FIXED)
+            {
+                msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
+                flags |= (1ULL<<(i+32));  /* enable fixed counter */
+            }
+            else if (westmereEX_counter_map[i].type > UNCORE)
+            {
+                if(haveLock)
+                {
+                    msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
+                    uflags[westmereEX_counter_map[i].type] |= /* unsigned shift: init assigns id 31 to PMC47 and 1<<31 on a signed int is undefined */
+                        (1U<<(perfmon_threadData[thread_id].counters[i].id));  /* enable uncore counter */
+                    if (westmereEX_counter_map[i].type == UBOX)
+                    {
+                        enable_ubox = 1;
+                    }
+                }
+            }
+        }
+    }
+
+    VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, GLOBAL_CTRL);
+
+    if (haveLock)
+    {
+        for ( int i=0; i<NUM_UNITS; i++ )
+        {
+            /* if counters are enabled write the according box ctrl register */
+            if (uflags[i])
+            {
+                msr_write(cpu_id, westmereEX_PMunits[i].ctrlRegister, uflags[i]);
+                VERBOSEPRINTREG(cpu_id, westmereEX_PMunits[i].ctrlRegister, LLU_CAST uflags[i], BOXCTRL);
+            }
+        }
+
+        /* set global enable flag in U BOX ctrl register */
+        uint32_t ubflags = 0x0UL;
+        ubflags |= (1<<28); /* enable all */
+        if (enable_ubox)
+        {
+            ubflags |= (1<<0); /* additionally set bit 0 when a UBOX counter is in use */
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubflags, UBOX_GLOBAL_CTRL);
+        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
+    }
+    /* Finally enable counters */
+    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
+    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+}
+
+void perfmon_stopCountersThread_nehalemEX(int thread_id)
+{   /* Writes 0 to the global control registers, then stores each initialised counter's register value in counterData. */
+    int haveLock = 0;
+    int cpu_id = perfmon_threadData[thread_id].processorId;
+
+    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL); /* clear the core global control register before reading */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        uint32_t ubflags = 0x0UL;
+        haveLock = 1;
+        //        ubflags |= (1<<29); /* reset all */
+        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags ); /* ubflags is still 0 here: the reset bit above is commented out */
+    }
+
+    for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ )
+    {
+        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        {
+            if (westmereEX_counter_map[i].type > UNCORE)
+            {
+                if(haveLock) /* uncore counters are read only by the CPU holding the socket lock */
+                {
+                    perfmon_threadData[thread_id].counters[i].counterData =
+                        msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
+
+                    VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
+                            LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_UNCORE);
+                }
+            }
+            else /* every other counter type is read unconditionally */
+            {
+                perfmon_threadData[thread_id].counters[i].counterData =
+                    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
+
+                VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
+                        LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_CORE);
+            }
+        }
+    }
+
+#if 0 /* disabled overflow check; NOTE(review): 'flags' is not declared in this function, so enabling this as-is would not compile */
+    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
+    printf ("Status: 0x%llX \n", LLU_CAST flags);
+    if((flags & 0x3) || (flags & (0x3ULL<<32)) )
+    {
+        printf ("Overflow occured \n");
+    }
+#endif
+}
+
+void perfmon_readCountersThread_nehalemEX(int thread_id)
+{   /* Stores the current register value of every initialised counter of thread_id in counterData; writes no MSR. */
+    int haveLock = 0;
+    int cpu_id = perfmon_threadData[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1; /* this CPU is the socket-lock owner, so it may read uncore counters */
+    }
+
+    for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ )
+    {
+        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        {
+            if (westmereEX_counter_map[i].type > UNCORE)
+            {
+                if(haveLock) /* without the lock, uncore counterData is left untouched */
+                {
+                    perfmon_threadData[thread_id].counters[i].counterData =
+                        msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
+                }
+            }
+            else /* every other counter type is read unconditionally */
+            {
+                perfmon_threadData[thread_id].counters[i].counterData =
+                    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
+            }
+        }
+    }
+}
+
diff --git a/src/includes/perfmon_nehalemEX_events.txt b/src/includes/perfmon_nehalemEX_events.txt
index 1aa2fa1..565f5ca 100644
--- a/src/includes/perfmon_nehalemEX_events.txt
+++ b/src/includes/perfmon_nehalemEX_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel NehalemEX
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
@@ -534,26 +534,26 @@ UMASK_C_CYCLES_TURBO_C7               0x80
 UMASK_C_CYCLES_TURBO_C_ALL            0xFF
 
 EVENT_C_C0_THROTTLE_DIE              0x01  WBOX
-UMASK_C_C0_THROTTLE_DIE_C0               0x01              
-UMASK_C_C0_THROTTLE_DIE_C1               0x02              
-UMASK_C_C0_THROTTLE_DIE_C2               0x04              
-UMASK_C_C0_THROTTLE_DIE_C3               0x08              
-UMASK_C_C0_THROTTLE_DIE_C4               0x10              
-UMASK_C_C0_THROTTLE_DIE_C5               0x20              
-UMASK_C_C0_THROTTLE_DIE_C6               0x40              
-UMASK_C_C0_THROTTLE_DIE_C7               0x80              
-UMASK_C_C0_THROTTLE_DIE_C_ALL            0xFF              
+UMASK_C_C0_THROTTLE_DIE_C0               0x01
+UMASK_C_C0_THROTTLE_DIE_C1               0x02
+UMASK_C_C0_THROTTLE_DIE_C2               0x04
+UMASK_C_C0_THROTTLE_DIE_C3               0x08
+UMASK_C_C0_THROTTLE_DIE_C4               0x10
+UMASK_C_C0_THROTTLE_DIE_C5               0x20
+UMASK_C_C0_THROTTLE_DIE_C6               0x40
+UMASK_C_C0_THROTTLE_DIE_C7               0x80
+UMASK_C_C0_THROTTLE_DIE_C_ALL            0xFF
 
 EVENT_C_C0_THROTTLE_PROCHOT          0x03  WBOX
-UMASK_C_C0_THROTTLE_PROCHOT_C0               0x01          
-UMASK_C_C0_THROTTLE_PROCHOT_C1               0x02          
-UMASK_C_C0_THROTTLE_PROCHOT_C2               0x04          
-UMASK_C_C0_THROTTLE_PROCHOT_C3               0x08          
-UMASK_C_C0_THROTTLE_PROCHOT_C4               0x10          
-UMASK_C_C0_THROTTLE_PROCHOT_C5               0x20          
-UMASK_C_C0_THROTTLE_PROCHOT_C6               0x40          
-UMASK_C_C0_THROTTLE_PROCHOT_C7               0x80          
-UMASK_C_C0_THROTTLE_PROCHOT_C_ALL            0xFF          
+UMASK_C_C0_THROTTLE_PROCHOT_C0               0x01
+UMASK_C_C0_THROTTLE_PROCHOT_C1               0x02
+UMASK_C_C0_THROTTLE_PROCHOT_C2               0x04
+UMASK_C_C0_THROTTLE_PROCHOT_C3               0x08
+UMASK_C_C0_THROTTLE_PROCHOT_C4               0x10
+UMASK_C_C0_THROTTLE_PROCHOT_C5               0x20
+UMASK_C_C0_THROTTLE_PROCHOT_C6               0x40
+UMASK_C_C0_THROTTLE_PROCHOT_C7               0x80
+UMASK_C_C0_THROTTLE_PROCHOT_C_ALL            0xFF
 
 EVENT_C_C0_THROTTLE_TMP              0x00  WBOX
 UMASK_C_C0_THROTTLE_TMP_C0               0x01
@@ -1463,7 +1463,7 @@ UMASK_GLOBAL_ARB_BID_PORT6_QLX1           0x0F 0x02  0x00
 UMASK_GLOBAL_ARB_BID_PORT7_QLX0           0x14 0x02  0x00
 UMASK_GLOBAL_ARB_BID_PORT7_QLX1           0x15 0x02  0x00
 
-EVENT_GLOBAL_ARB_BID_FAIL                           0x01  RBO0A
+EVENT_GLOBAL_ARB_BID_FAIL                           0x01  RBOX0
 UMASK_GLOBAL_ARB_BID_FAIL_PORT0_QLX0_VN0_HOM        0x02 0x05 0x00
 UMASK_GLOBAL_ARB_BID_FAIL_PORT0_QLX0_VN0_SNP        0x02 0x05 0x01
 UMASK_GLOBAL_ARB_BID_FAIL_PORT0_QLX0_VN0_NDR        0x02 0x05 0x02
@@ -2751,3 +2751,580 @@ UMASK_TARGET_AVAILABLE_PORT7_QLX1_VN1_NCB        0x15 0x09 0x0D
 UMASK_TARGET_AVAILABLE_PORT7_QLX1_VN1_VSM        0x15 0x09 0x0E
 UMASK_TARGET_AVAILABLE_PORT7_QLX1_VN1_VLG        0x15 0x09 0x0F
 
+EVENT_LLC_MISSES                                 0x14 CBOX
+UMASK_LLC_MISSES_SHARED                          0x01
+UMASK_LLC_MISSES_FORWARD                         0x02
+UMASK_LLC_MISSES_INVALID                         0x04
+UMASK_LLC_MISSES_ALL                             0x07
+
+EVENT_LLC_HITS                                   0x15 CBOX
+UMASK_LLC_HITS_MODIFIED                          0x01
+UMASK_LLC_HITS_EXCLUSIVE                         0x02
+UMASK_LLC_HITS_SHARED                            0x04
+UMASK_LLC_HITS_FORWARD                           0x08
+UMASK_LLC_HITS_ALL                               0x0F
+
+EVENT_LLC_S_FILLS                               0x16 CBOX
+UMASK_LLC_S_FILLS_MODIFIED                      0x01
+UMASK_LLC_S_FILLS_EXCLUSIVE                     0x02
+UMASK_LLC_S_FILLS_SHARED                        0x04
+UMASK_LLC_S_FILLS_FORWARD                       0x08
+UMASK_LLC_S_FILLS_ALL                           0x0F
+
+EVENT_LLC_VICTIMS                               0x17 CBOX
+UMASK_LLC_VICTIMS_MODIFIED                      0x01
+UMASK_LLC_VICTIMS_EXCLUSIVE                     0x02
+UMASK_LLC_VICTIMS_SHARED                        0x04
+UMASK_LLC_VICTIMS_FORWARD                       0x08
+UMASK_LLC_VICTIMS_ALL                           0x1F
+UMASK_LLC_VICTIMS_FILL_WITHOUT_VICTIMS          0x10
+
+EVENT_ARB_LOSSES                                 0x0A CBOX
+UMASK_ARB_LOSSES_AD_SB                           0x01
+UMASK_ARB_LOSSES_AD_NSB                          0x02
+UMASK_ARB_LOSSES_AD_ALL                          0x03
+UMASK_ARB_LOSSES_AK_SB                           0x04
+UMASK_ARB_LOSSES_AK_NSB                          0x08
+UMASK_ARB_LOSSES_AK_ALL                          0x0C
+UMASK_ARB_LOSSES_BL_SB                           0x10
+UMASK_ARB_LOSSES_BL_NSB                          0x20
+UMASK_ARB_LOSSES_BL_ALL                          0x30
+UMASK_ARB_LOSSES_IV                              0x40
+UMASK_ARB_LOSSES_ALL                             0x7F
+
+EVENT_ARB_WINS                                 0x09 CBOX
+UMASK_ARB_WINS_AD_SB                           0x01
+UMASK_ARB_WINS_AD_NSB                          0x02
+UMASK_ARB_WINS_AD_ALL                          0x03
+UMASK_ARB_WINS_AK_SB                           0x04
+UMASK_ARB_WINS_AK_NSB                          0x08
+UMASK_ARB_WINS_AK_ALL                          0x0C
+UMASK_ARB_WINS_BL_SB                           0x10
+UMASK_ARB_WINS_BL_NSB                          0x20
+UMASK_ARB_WINS_BL_ALL                          0x30
+UMASK_ARB_WINS_IV                              0x40
+UMASK_ARB_WINS_ALL                             0x7F
+
+EVENT_BOUNCES_C2P_AK                            0x02 CBOX
+UMASK_BOUNCES_C2P_AK_SB                         0x01
+UMASK_BOUNCES_C2P_AK_NSB                        0x02
+UMASK_BOUNCES_C2P_AK_ALL                        0x03
+
+EVENT_BOUNCES_C2P_BL                            0x03 CBOX
+UMASK_BOUNCES_C2P_BL_SB                         0x01
+UMASK_BOUNCES_C2P_BL_NSB                        0x02
+UMASK_BOUNCES_C2P_BL_ALL                        0x03
+
+EVENT_BOUNCES_C2P_IV                            0x04 CBOX
+UMASK_BOUNCES_C2P_IV                            0x00
+
+EVENT_BOUNCES_P2C_AD                            0x01 CBOX
+UMASK_BOUNCES_P2C_AD_SB                         0x01
+UMASK_BOUNCES_P2C_AD_NSB                        0x02
+UMASK_BOUNCES_P2C_AD_ALL                        0x03
+
+EVENT_EGRESS_BYPASS_WINS                        0x0C CBOX
+UMASK_EGRESS_BYPASS_WINS_AD_BYP0                0x01
+UMASK_EGRESS_BYPASS_WINS_AD_BYP1                0x02
+UMASK_EGRESS_BYPASS_WINS_AK_BYP0                0x04
+UMASK_EGRESS_BYPASS_WINS_AK_BYP1                0x08
+UMASK_EGRESS_BYPASS_WINS_BL_BYP0                0x10
+UMASK_EGRESS_BYPASS_WINS_BL_BYP1                0x20
+UMASK_EGRESS_BYPASS_WINS_IV_BYP0                0x40
+UMASK_EGRESS_BYPASS_WINS_IV_BYP1                0x80
+
+EVENT_INGRESS_BYPASS_WINS_AD                    0x0E CBOX
+UMASK_INGRESS_BYPASS_WINS_AD_IRQ_BYP0           0x01
+UMASK_INGRESS_BYPASS_WINS_AD_IRQ_BYP1           0x02
+UMASK_INGRESS_BYPASS_WINS_AD_IPQ_BYP0           0x04
+UMASK_INGRESS_BYPASS_WINS_AD_IPQ_BYP1           0x08
+
+EVENT_MAF_ACK                                0x10 CBOX
+UMASK_MAF_ACK                                0x00
+
+EVENT_MAF_NACK1                                0x11 CBOX
+UMASK_MAF_NACK1_GO_PENDING                     0x01
+UMASK_MAF_NACK1_VIC_PENDING                    0x02
+UMASK_MAF_NACK1_SNP_PENDING                    0x04
+UMASK_MAF_NACK1_AC_PENDING                     0x08
+UMASK_MAF_NACK1_IDX_BLOCK                      0x10
+UMASK_MAF_NACK1_PA_BLOCK                       0x20
+UMASK_MAF_NACK1_IDLE_QPI                       0x40
+UMASK_MAF_NACK1_ALL_MAF_NACK1                  0x80
+UMASK_MAF_NACK1_TOTAL_MAF_NACKS                0xFF
+
+EVENT_MAF_NACK2                                0x12 CBOX
+UMASK_MAF_NACK2_MAF_FULL                       0x01
+UMASK_MAF_NACK2_EGRESS_FULL                    0x02
+UMASK_MAF_NACK2_VIQ_FULL                       0x04
+UMASK_MAF_NACK2_NO_TRACKER_CREDITS             0x08
+UMASK_MAF_NACK2_NO_S_FIFO_CREDITS              0x10
+UMASK_MAF_NACK2_NO_S_REQTBL_ENTRIES            0x20
+UMASK_MAF_NACK2_WB_PENDING                     0x40
+UMASK_MAF_NACK2_NACK2_ELSE                     0x80
+
+EVENT_OCCUPANCY_IPQ                            0x1A CBOX
+UMASK_OCCUPANCY_IPQ                            0x00
+
+EVENT_OCCUPANCY_IRQ                            0x18 CBOX
+UMASK_OCCUPANCY_IRQ                            0x00
+
+EVENT_OCCUPANCY_MAF                            0x1E CBOX
+UMASK_OCCUPANCY_MAF                            0x00
+
+EVENT_OCCUPANCY_RSPF                           0x22 CBOX
+UMASK_OCCUPANCY_RSPF                           0x00
+
+EVENT_OCCUPANCY_RWRF                           0x20 CBOX
+UMASK_OCCUPANCY_RWRF                           0x00
+
+EVENT_OCCUPANCY_VIQ                            0x1C CBOX
+UMASK_OCCUPANCY_VIQ                            0x00
+
+EVENT_SINKS_C2P                                0x06 CBOX
+UMASK_SINKS_C2P_IV                             0x01
+UMASK_SINKS_C2P_AK                             0x02
+UMASK_SINKS_C2P_BL                             0x04
+
+EVENT_SINKS_P2C                                0x05 CBOX
+UMASK_SINKS_P2C_IV                             0x01
+UMASK_SINKS_P2C_AK                             0x02
+UMASK_SINKS_P2C_BL                             0x04
+
+EVENT_SINKS_S2C                                0x07 CBOX
+UMASK_SINKS_S2C_AD                             0x01
+UMASK_SINKS_S2C_AK                             0x02
+UMASK_SINKS_S2C_BL                             0x04
+
+EVENT_SINKS_S2P_BL                            0x08 CBOX
+UMASK_SINKS_S2P_BL                            0x00
+
+EVENT_SNP_HITS                                  0x28 CBOX
+UMASK_SNP_HITS_REMOTE_RD_HITM                   0x01
+UMASK_SNP_HITS_REMOTE_RD_HITE                   0x02
+UMASK_SNP_HITS_REMOTE_RD_HITS                   0x04
+UMASK_SNP_HITS_REMOTE_RD_HITF                   0x08
+UMASK_SNP_HITS_REMOTE_RFO_HITM                  0x10
+UMASK_SNP_HITS_REMOTE_RFO_HITE                  0x20
+UMASK_SNP_HITS_REMOTE_RFO_HITS                  0x40
+UMASK_SNP_HITS_REMOTE_RFO_HITF                  0x80
+UMASK_SNP_HITS_REMOTE_HITM                      0x11
+UMASK_SNP_HITS_REMOTE_HITE                      0x22
+UMASK_SNP_HITS_REMOTE_HITS                      0x44
+UMASK_SNP_HITS_REMOTE_HITF                      0x88
+UMASK_SNP_HITS_REMOTE_ANY                       0xFF
+
+EVENT_SNPS                                      0x27 CBOX
+UMASK_SNPS_REMOTE_RD                            0x01
+UMASK_SNPS_REMOTE_RFO                           0x02
+UMASK_SNPS_REMOTE_ANY                           0x03
+
+EVENT_STARVED_EGRESS                            0x0B CBOX
+UMASK_STARVED_EGRESS_P2C_AD_SB                  0x01
+UMASK_STARVED_EGRESS_C2P_AD_SB                  0x02
+UMASK_STARVED_EGRESS_AD_SB                      0x03
+UMASK_STARVED_EGRESS_AD_NSB                     0x04
+UMASK_STARVED_EGRESS_AD                         0x07
+UMASK_STARVED_EGRESS_AK_SB                      0x08
+UMASK_STARVED_EGRESS_AK_NSB                     0x10
+UMASK_STARVED_EGRESS_AK                         0x18
+UMASK_STARVED_EGRESS_BL_SB                      0x20
+UMASK_STARVED_EGRESS_BL_NSB                     0x40
+UMASK_STARVED_EGRESS_BL                         0x60
+UMASK_STARVED_EGRESS_IV                         0x80
+
+EVENT_TRANS_IPQ                                 0x1B CBOX
+UMASK_TRANS_IPQ                                 0x00
+
+EVENT_TRANS_IRQ                                 0x19 CBOX
+UMASK_TRANS_IRQ                                 0x00
+
+EVENT_TRANS_MAF                                 0x1F CBOX
+UMASK_TRANS_MAF                                 0x00
+
+EVENT_TRANS_RSPF                                0x23 CBOX
+UMASK_TRANS_RSPF                                0x00
+
+EVENT_TRANS_RWRF                                0x21 CBOX
+UMASK_TRANS_RWRF                                0x00
+
+EVENT_TRANS_VIQ                                 0x1D CBOX
+UMASK_TRANS_VIQ                                 0x00
+
+EVENT_BUF_VALID_LOCAL_INT                        0x00  UBOX
+UMASK_BUF_VALID_LOCAL_INT                        0x00
+
+EVENT_BUF_VALID_REMOTE_INT                       0x01  UBOX
+UMASK_BUF_VALID_REMOTE_INT                       0x00
+
+EVENT_BUF_VALID_LOCK                             0x02  UBOX
+UMASK_BUF_VALID_LOCK                             0x00
+
+EVENT_BUF_VALID_STST                             0x03  UBOX
+UMASK_BUF_VALID_STST                             0x00
+
+EVENT_BUF_VALID_SPC_CYCLES                       0x04  UBOX
+UMASK_BUF_VALID_SPC_CYCLES                       0x00
+
+EVENT_CORRECTED_ERR                             0x1E4  UBOX
+UMASK_CORRECTED_ERR                             0x00
+
+EVENT_FATAL_ERR                                 0x1E6  UBOX
+UMASK_FATAL_ERR                                 0x00
+
+EVENT_IPIS_SENT                                  0xF9  UBOX
+UMASK_IPIS_SENT                                  0x00
+
+EVENT_RECOV                                     0x1DF  UBOX
+UMASK_RECOV                                      0x00
+
+EVENT_U2R_REQUESTS                               0x050  UBOX
+UMASK_U2R_REQUESTS                               0x00
+
+EVENT_U2R_REQUEST_CYCLES                         0x051  UBOX
+UMASK_U2R_REQUEST_CYCLES                         0x00
+
+EVENT_WOKEN                                      0xF8  UBOX
+UMASK_WOKEN                                      0x00
+
+EVENT_TO_R_B_HOM_MSGQ_CYCLES_FULL               0x03 SBOX
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_RBOX          0x01
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_BBOX          0x02
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_ALL           0x03
+
+EVENT_TO_R_B_HOM_MSGQ_CYCLES_NE                 0x06 SBOX
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_RBOX            0x01
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_BBOX            0x02
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_ALL             0x03
+
+EVENT_TO_R_B_HOM_MSGQ_OCCUPANCY                 0x07 SBOX
+UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_RBOX            0x01
+UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_BBOX            0x02
+UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_ALL             0x03
+
+EVENT_B2S_DRS_BYPASS                            0x53 SBOX
+UMASK_B2S_DRS_BYPASS                            0x00
+
+EVENT_BBOX_CREDITS                              0x77 SBOX
+UMASK_BBOX_CREDITS                              0x00
+
+EVENT_BBOX_CREDIT_RETURNS                       0x6B SBOX
+UMASK_BBOX_CREDIT_RETURNS                       0x00
+
+EVENT_BBOX_HOM_BYPASS                           0x54 SBOX
+UMASK_BBOX_HOM_BYPASS                           0x00
+
+EVENT_EGRESS_BYPASS                             0x40 SBOX
+UMASK_EGRESS_BYPASS_AD_CW                      0x01
+UMASK_EGRESS_BYPASS_AD_CCW                     0x02
+UMASK_EGRESS_BYPASS_AD                         0x03
+UMASK_EGRESS_BYPASS_AK_CW                      0x04
+UMASK_EGRESS_BYPASS_AK_CCW                     0x08
+UMASK_EGRESS_BYPASS_AK                         0x0C
+UMASK_EGRESS_BYPASS_BL_CW                      0x10
+UMASK_EGRESS_BYPASS_BL_CCW                     0x20
+UMASK_EGRESS_BYPASS_BL                         0x30
+
+EVENT_EGRESS_ARB_WINS                           0x41 SBOX
+UMASK_EGRESS_ARB_WINS_AD_CW                   0x01
+UMASK_EGRESS_ARB_WINS_AD_CCW                  0x02
+UMASK_EGRESS_ARB_WINS_AD                      0x03
+UMASK_EGRESS_ARB_WINS_AK_CW                   0x04
+UMASK_EGRESS_ARB_WINS_AK_CCW                  0x08
+UMASK_EGRESS_ARB_WINS_AK                      0x0C
+UMASK_EGRESS_ARB_WINS_BL_CW                   0x10
+UMASK_EGRESS_ARB_WINS_BL_CCW                  0x20
+UMASK_EGRESS_ARB_WINS_BL                      0x30
+
+EVENT_EGRESS_ARB_LOSSES                         0x42 SBOX
+UMASK_EGRESS_ARB_LOSSES_AD_CW                   0x01
+UMASK_EGRESS_ARB_LOSSES_AD_CCW                  0x02
+UMASK_EGRESS_ARB_LOSSES_AD                      0x03
+UMASK_EGRESS_ARB_LOSSES_AK_CW                   0x04
+UMASK_EGRESS_ARB_LOSSES_AK_CCW                  0x08
+UMASK_EGRESS_ARB_LOSSES_AK                      0x0C
+UMASK_EGRESS_ARB_LOSSES_BL_CW                   0x10
+UMASK_EGRESS_ARB_LOSSES_BL_CCW                  0x20
+UMASK_EGRESS_ARB_LOSSES_BL                      0x30
+
+EVENT_EGRESS_STARVED                            0x43 SBOX
+UMASK_EGRESS_STARVED_AD_CW                      0x01
+UMASK_EGRESS_STARVED_AD_CCW                     0x02
+UMASK_EGRESS_STARVED_AD                         0x03
+UMASK_EGRESS_STARVED_AK_CW                      0x04
+UMASK_EGRESS_STARVED_AK_CCW                     0x08
+UMASK_EGRESS_STARVED_AK                         0x0C
+UMASK_EGRESS_STARVED_BL_CW                      0x10
+UMASK_EGRESS_STARVED_BL_CCW                     0x20
+UMASK_EGRESS_STARVED_BL                         0x30
+
+EVENT_FLITS_SENT_DRS                            0x65 SBOX
+UMASK_FLITS_SENT_DRS                            0x00
+
+EVENT_FLITS_SENT_NCB                            0x69 SBOX
+UMASK_FLITS_SENT_NCB                            0x00
+
+EVENT_FLITS_SENT_NCS                            0x67 SBOX
+UMASK_FLITS_SENT_NCS                            0x00
+
+EVENT_HALFLINE_BYPASS                           0x30 SBOX
+UMASK_HALFLINE_BYPASS                           0x00
+
+EVENT_NO_CREDIT_AD                              0x87 SBOX
+UMASK_NO_CREDIT_AD                              0x00
+
+EVENT_NO_CREDIT_AK                              0x88 SBOX
+UMASK_NO_CREDIT_AK                              0x00
+
+EVENT_NO_CREDIT_BL                              0x89 SBOX
+UMASK_NO_CREDIT_BL                              0x00
+
+EVENT_NO_CREDIT_HOM                             0x80 SBOX
+UMASK_NO_CREDIT_HOM                             0x00
+
+EVENT_NO_CREDIT_SNP                             0x81 SBOX
+UMASK_NO_CREDIT_SNP                             0x00
+
+EVENT_NO_CREDIT_DRS                             0x82 SBOX
+UMASK_NO_CREDIT_DRS                             0x00
+
+EVENT_NO_CREDIT_NCS                             0x83 SBOX
+UMASK_NO_CREDIT_NCS                             0x00
+
+EVENT_NO_CREDIT_NCB                             0x84 SBOX
+UMASK_NO_CREDIT_NCB                             0x00
+
+EVENT_NO_CREDIT_NDR                             0x85 SBOX
+UMASK_NO_CREDIT_NDR                             0x00
+
+EVENT_NO_CREDIT_IPQ                             0x8A SBOX
+UMASK_NO_CREDIT_IPQ                             0x00
+
+EVENT_NO_CREDIT_VNA                             0x86 SBOX
+UMASK_NO_CREDIT_VNA_RBOX                        0x01
+UMASK_NO_CREDIT_VNA_BBOX                        0x02
+UMASK_NO_CREDIT_VNA_ALL                         0x03
+
+EVENT_PKTS_RCVD_DRS_FROM_R                      0x72 SBOX
+UMASK_PKTS_RCVD_DRS_FROM_R                      0x00
+
+EVENT_PKTS_RCVD_DRS_FROM_B                      0x73 SBOX
+UMASK_PKTS_RCVD_DRS_FROM_B                      0x00
+
+EVENT_PKTS_RCVD_NCB                             0x75 SBOX
+UMASK_PKTS_RCVD_NCB                             0x00
+
+EVENT_PKTS_RCVD_NCS                             0x74 SBOX
+UMASK_PKTS_RCVD_NCS                             0x00
+
+EVENT_PKTS_RCVD_NDR                             0x70 SBOX
+UMASK_PKTS_RCVD_NDR                             0x00
+
+EVENT_PKTS_RCVD_SNP                             0x71 SBOX
+UMASK_PKTS_RCVD_SNP                             0x00
+
+EVENT_PKTS_SENT_DRS                             0x64 SBOX
+UMASK_PKTS_SENT_DRS_CBOX0_4                     0x01
+UMASK_PKTS_SENT_DRS_CBOX1_5                     0x02
+UMASK_PKTS_SENT_DRS_CBOX2_6                     0x04
+UMASK_PKTS_SENT_DRS_CBOX3_7                     0x08
+UMASK_PKTS_SENT_DRS_ALL                         0x0F
+
+EVENT_PKTS_SENT_HOM                             0x60 SBOX
+UMASK_PKTS_SENT_HOM_RBOX                        0x01
+UMASK_PKTS_SENT_HOM_BBOX                        0x02
+UMASK_PKTS_SENT_HOM_ALL                         0x03
+
+EVENT_PKTS_SENT_NCB                             0x68 SBOX
+UMASK_PKTS_SENT_NCB_CBOX0_4                     0x01
+UMASK_PKTS_SENT_NCB_CBOX1_5                     0x02
+UMASK_PKTS_SENT_NCB_CBOX2_6                     0x04
+UMASK_PKTS_SENT_NCB_CBOX3_7                     0x08
+UMASK_PKTS_SENT_NCB_ALL                         0x0F
+
+EVENT_PKTS_SENT_NCS                             0x66 SBOX
+UMASK_PKTS_SENT_NCS_CBOX0_4                     0x01
+UMASK_PKTS_SENT_NCS_CBOX1_5                     0x02
+UMASK_PKTS_SENT_NCS_CBOX2_6                     0x04
+UMASK_PKTS_SENT_NCS_CBOX3_7                     0x08
+UMASK_PKTS_SENT_NCS_ALL                         0x0F
+
+EVENT_PKTS_SENT_NDR                             0x63 SBOX
+UMASK_PKTS_SENT_NDR                             0x00
+
+EVENT_PKTS_SENT_SNP                             0x62 SBOX
+UMASK_PKTS_SENT_SNP                             0x00
+
+EVENT_RBOX_CREDIT_RETURNS                       0x6A SBOX
+UMASK_RBOX_CREDIT_RETURNS                       0x00
+
+EVENT_RBOX_CREDIT_CARRIERS                       0x76 SBOX
+UMASK_RBOX_CREDIT_CARRIERS                       0x00
+
+EVENT_RBOX_HOM_BYPASS                           0x50 SBOX
+UMASK_RBOX_HOM_BYPASS                           0x00
+
+EVENT_RBOX_SNP_BYPASS                           0x51 SBOX
+UMASK_RBOX_SNP_BYPASS_SNP                       0x01
+UMASK_RBOX_SNP_BYPASS_BIG_SNP                   0x02
+UMASK_RBOX_SNP_BYPASS_ALL                       0x03
+
+EVENT_REQ_TBL_OCCUPANCY                         0x31 SBOX
+UMASK_REQ_TBL_OCCUPANCY_LOCAL                   0x01
+UMASK_REQ_TBL_OCCUPANCY_REMOTE                  0x02
+UMASK_REQ_TBL_OCCUPANCY_ALL                     0x03
+
+EVENT_S2B_HOM_BYPASS                            0x52 SBOX
+UMASK_S2B_HOM_BYPASS                            0x00
+
+EVENT_TO_RING_B2S_MSGQ_CYCLES_FULL              0x2B SBOX
+UMASK_TO_RING_B2S_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_B2S_MSGQ_CYCLES_NE                0x2D SBOX
+UMASK_TO_RING_B2S_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_RING_B2S_MSGQ_OCCUPANCY                0x2F SBOX
+UMASK_TO_RING_B2S_MSGQ_OCCUPANCY                0x00
+
+EVENT_TO_RING_MSGQ_OCCUPANCY                    0x26 SBOX
+UMASK_TO_RING_MSGQ_OCCUPANCY_SNP                0x01
+UMASK_TO_RING_MSGQ_OCCUPANCY_NCS                0x02
+UMASK_TO_RING_MSGQ_OCCUPANCY_NCB                0x04
+UMASK_TO_RING_MSGQ_OCCUPANCY_ALL                0x07
+
+EVENT_TO_RING_NCB_MSGQ_CYCLES_FULL              0x21 SBOX
+UMASK_TO_RING_NCB_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_NCB_MSGQ_CYCLES_NE                0x24 SBOX
+UMASK_TO_RING_NCB_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_RING_NCS_MSGQ_CYCLES_FULL              0x22 SBOX
+UMASK_TO_RING_NCS_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_NDR_MSGQ_CYCLES_FULL              0x27 SBOX
+UMASK_TO_RING_NDR_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_NDR_MSGQ_CYCLES_NE                0x28 SBOX
+UMASK_TO_RING_NDR_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_RING_NDR_MSGQ_OCCUPANCY                0x29 SBOX
+UMASK_TO_RING_NDR_MSGQ_OCCUPANCY                0x00
+
+EVENT_TO_RING_R2S_MSGQ_CYCLES_FULL              0x2A SBOX
+UMASK_TO_RING_R2S_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_R2S_MSGQ_CYCLES_NE                0x2C SBOX
+UMASK_TO_RING_R2S_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_RING_R2S_MSGQ_OCCUPANCY                0x2E SBOX
+UMASK_TO_RING_R2S_MSGQ_OCCUPANCY                0x00
+
+EVENT_TO_RING_SNP_MSGQ_CYCLES_FULL              0x20 SBOX
+UMASK_TO_RING_SNP_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_SNP_MSGQ_CYCLES_NE                0x23 SBOX
+UMASK_TO_RING_SNP_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_R_DRS_MSGQ_CYCLES_FULL                 0x0E SBOX
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_CBOX0_4         0x01
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_CBOX1_5         0x02
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_CBOX2_6         0x04
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_CBOX3_7         0x08
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_ALL             0x0F
+
+EVENT_TO_R_DRS_MSGQ_CYCLES_NE                   0x0F SBOX
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_CBOX0_4           0x01
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_CBOX1_5           0x02
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_CBOX2_6           0x04
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_CBOX3_7           0x08
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_ALL               0x0F
+
+EVENT_TO_R_DRS_MSGQ_OCCUPANCY                   0x10 SBOX
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_CBOX0_4           0x01
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_CBOX1_5           0x02
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_CBOX2_6           0x04
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_CBOX3_7           0x08
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_ALL               0x0F
+
+EVENT_TO_R_B_HOM_MSGQ_CYCLES_FULL               0x03 SBOX
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_RBOX          0x01
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_BBOX          0x02
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_ALL           0x03
+
+EVENT_TO_R_B_HOM_MSGQ_CYCLES_NE                 0x06 SBOX
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_RBOX            0x01
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_BBOX            0x02
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_ALL             0x03
+
+EVENT_TO_R_B_HOM_MSGQ_OCCUPANCY                 0x07 SBOX
+UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_RBOX            0x01
+UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_BBOX            0x02
+UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_ALL             0x03
+
+EVENT_TO_R_NCB_MSGQ_CYCLES_FULL                 0x11 SBOX
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_CBOX0_4         0x01
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_CBOX1_5         0x02
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_CBOX2_6         0x04
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_CBOX3_7         0x08
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_ALL             0x0F
+
+EVENT_TO_R_NCB_MSGQ_CYCLES_NE                   0x12 SBOX
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_CBOX0_4           0x01
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_CBOX1_5           0x02
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_CBOX2_6           0x04
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_CBOX3_7           0x08
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_ALL               0x0F
+
+EVENT_TO_R_NCB_MSGQ_OCCUPANCY                   0x13 SBOX
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_CBOX0_4           0x01
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_CBOX1_5           0x02
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_CBOX2_6           0x04
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_CBOX3_7           0x08
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_ALL               0x0F
+
+EVENT_TO_R_NCS_MSGQ_CYCLES_FULL                 0x14 SBOX
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_CBOX0_4         0x01
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_CBOX1_5         0x02
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_CBOX2_6         0x04
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_CBOX3_7         0x08
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_ALL             0x0F
+
+EVENT_TO_R_NCS_MSGQ_CYCLES_NE                   0x15 SBOX
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_CBOX0_4           0x01
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_CBOX1_5           0x02
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_CBOX2_6           0x04
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_CBOX3_7           0x08
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_ALL               0x0F
+
+EVENT_TO_R_NCS_MSGQ_OCCUPANCY                   0x16 SBOX
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_CBOX0_4           0x01
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_CBOX1_5           0x02
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_CBOX2_6           0x04
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_CBOX3_7           0x08
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_ALL               0x0F
+
+EVENT_TO_R_NDR_MSGQ_CYCLES_FULL                 0x0B SBOX
+UMASK_TO_R_NDR_MSGQ_CYCLES_FULL                 0x00
+
+EVENT_TO_R_NDR_MSGQ_CYCLES_NE                   0x0C SBOX
+UMASK_TO_R_NDR_MSGQ_CYCLES_NE                   0x00
+
+EVENT_TO_R_NDR_MSGQ_OCCUPANCY                   0x0D SBOX
+UMASK_TO_R_NDR_MSGQ_OCCUPANCY                   0x00
+
+EVENT_TO_R_PROG_EV                              0x00 SBOX
+UMASK_TO_R_PROG_EV                              0x00
+
+EVENT_TO_R_B_REQUESTS                           0x6C SBOX
+UMASK_TO_R_B_REQUESTS_LOCAL                     0x01
+UMASK_TO_R_B_REQUESTS_REMOTE                    0x02
+UMASK_TO_R_B_REQUESTS_ALL                       0x03
+
+EVENT_TO_R_SNP_MSGQ_CYCLES_FULL                 0x08 SBOX
+UMASK_TO_R_SNP_MSGQ_CYCLES_FULL                 0x00
+
+EVENT_TO_R_SNP_MSGQ_CYCLES_NE                   0x09 SBOX
+UMASK_TO_R_SNP_MSGQ_CYCLES_NE                   0x00
+
+EVENT_TO_R_SNP_MSGQ_OCCUPANCY                   0x0A SBOX
+UMASK_TO_R_SNP_MSGQ_OCCUPANCY                   0x00
diff --git a/src/includes/perfmon_nehalem_counters.h b/src/includes/perfmon_nehalem_counters.h
index da61ea4..d3831c1 100644
--- a/src/includes/perfmon_nehalem_counters.h
+++ b/src/includes/perfmon_nehalem_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Counter Header File of perfmon module for Nehalem.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_nehalem_events.txt b/src/includes/perfmon_nehalem_events.txt
index 2abe611..0eeed50 100644
--- a/src/includes/perfmon_nehalem_events.txt
+++ b/src/includes/perfmon_nehalem_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel Nehalem
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_p6_events.txt b/src/includes/perfmon_p6_events.txt
index 4ebe03b..0db8338 100644
--- a/src/includes/perfmon_p6_events.txt
+++ b/src/includes/perfmon_p6_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Pentium 3
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_phi.h b/src/includes/perfmon_phi.h
index 4dfddec..0f5dd54 100644
--- a/src/includes/perfmon_phi.h
+++ b/src/includes/perfmon_phi.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of perfmon module for Xeon Phi.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -61,15 +61,14 @@ void perfmon_setupCounterThread_phi(
         PerfmonEvent* event,
         PerfmonCounterIndex index)
 {
-    uint64_t flags;
+    uint64_t flags = 0x0ULL;
     uint64_t reg = phi_counter_map[index].configRegister;
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    perfmon_threadData[thread_id].counters[index].init = TRUE;
 
     if (phi_counter_map[index].type == PMC)
     {
-        perfmon_threadData[thread_id].counters[index].init = TRUE;
-        flags = msr_read(cpu_id,reg);
-        flags &= ~(0xFFFFU);
+        flags = (1<<22)|(1<<16);
 
         /* Intel with standard 8 bit event mask: [7:0] */
         flags |= (event->umask<<8) + event->eventId;
diff --git a/src/includes/perfmon_phi_counters.h b/src/includes/perfmon_phi_counters.h
index 7203cfa..edf0658 100644
--- a/src/includes/perfmon_phi_counters.h
+++ b/src/includes/perfmon_phi_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description: Counter Header File of perfmon module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_phi_events.txt b/src/includes/perfmon_phi_events.txt
index 36f4167..d6393ba 100644
--- a/src/includes/perfmon_phi_events.txt
+++ b/src/includes/perfmon_phi_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel Xeon Phi
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_pm.h b/src/includes/perfmon_pm.h
index 1821d0a..88346d1 100644
--- a/src/includes/perfmon_pm.h
+++ b/src/includes/perfmon_pm.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of perfmon module Pentium M.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -58,12 +58,12 @@ void perfmon_init_pm(PerfmonThread *thread)
     msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
 
     /* Preinit of two PMC counters */
-    flags |= (1<<16);  /* user mode flag */
-    flags |= (1<<19);  /* pin control flag */
+    //flags |= (1<<16);  /* user mode flag */
+    //flags |= (1<<19);  /* pin control flag */
     //    flags |= (1<<22);  /* enable flag */
 
-    msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
+    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
+    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);*/
 }
 
 void perfmon_setupCounterThread_pm(
@@ -76,8 +76,7 @@ void perfmon_setupCounterThread_pm(
     int cpu_id = perfmon_threadData[thread_id].processorId;
 
     perfmon_threadData[thread_id].counters[index].init = TRUE;
-    flags = msr_read(cpu_id,reg);
-    flags &= ~(0xFFFFU); 
+    flags = (1<<16)|(1<<19);
 
     /* Intel with standard 8 bit event mask: [7:0] */
     flags |= (event->umask<<8) + event->eventId;
@@ -134,7 +133,7 @@ void perfmon_stopCountersThread_pm(int thread_id)
         if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
         {
             perfmon_threadData[thread_id].counters[i].counterData =
-				msr_read(cpu_id, pm_counter_map[i].counterRegister);
+                msr_read(cpu_id, pm_counter_map[i].counterRegister);
         }
     }
 }
diff --git a/src/includes/perfmon_pm_counters.h b/src/includes/perfmon_pm_counters.h
index 4d14f96..9119096 100644
--- a/src/includes/perfmon_pm_counters.h
+++ b/src/includes/perfmon_pm_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description: Counter Header File of perfmon module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_pm_events.txt b/src/includes/perfmon_pm_events.txt
index 5765f05..9ed83a8 100644
--- a/src/includes/perfmon_pm_events.txt
+++ b/src/includes/perfmon_pm_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel Pentium M
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_sandybridge.h b/src/includes/perfmon_sandybridge.h
index ec9687a..f11714a 100644
--- a/src/includes/perfmon_sandybridge.h
+++ b/src/includes/perfmon_sandybridge.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of perfmon module for Sandy Bridge.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -67,13 +67,13 @@ void perfmon_init_sandybridge(PerfmonThread *thread)
     msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
 
     /* Preinit of PERFEVSEL registers */
-    flags |= (1<<22);  /* enable flag */
-    flags |= (1<<16);  /* user mode flag */
+    //flags |= (1<<22);  /* enable flag */
+    //flags |= (1<<16);  /* user mode flag */
 
-    msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
+    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
     msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
     msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);
+    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
 
     /* TODO Robust implementation which also works if stuff is not there */
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
@@ -180,12 +180,11 @@ void perfmon_init_sandybridge(PerfmonThread *thread)
                 LLU_CAST reg, \
                 LLU_CAST flags); \
     } \
-if(haveLock) { \
-    uflags = pci_read(cpu_id, channel, reg);  \
-    uflags &= ~(0xFFFFU);  \
-    uflags |= (event->umask<<8) + event->eventId;  \
-    pci_write(cpu_id, channel,  reg, uflags);  \
-}
+    if(haveLock) { \
+        uflags = (1<<22); \
+        uflags |= (event->umask<<8) + event->eventId;  \
+        pci_write(cpu_id, channel,  reg, uflags);  \
+    }
 
 
 void perfmon_setupCounterThread_sandybridge(
@@ -198,6 +197,8 @@ void perfmon_setupCounterThread_sandybridge(
     uint32_t uflags;
     uint64_t reg = sandybridge_counter_map[index].configRegister;
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
+    uint64_t orig_fixed_flags = fixed_flags;
     perfmon_threadData[thread_id].counters[index].init = TRUE;
 
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
@@ -209,8 +210,9 @@ void perfmon_setupCounterThread_sandybridge(
     {
         case PMC:
 
-            flags = msr_read(cpu_id,reg);
-            flags &= ~(0xFFFFU);   /* clear lower 16bits */
+            //flags = msr_read(cpu_id,reg);
+            //flags &= ~(0xFFFFU);   /* clear lower 16bits */
+            flags = (1<<22)|(1<<16);
 
             /* Intel with standard 8 bit event mask: [7:0] */
             flags |= (event->umask<<8) + event->eventId;
@@ -233,6 +235,7 @@ void perfmon_setupCounterThread_sandybridge(
             break;
 
         case FIXED:
+            fixed_flags |= (0x2 << (index*4));
             break;
 
         case POWER:
@@ -261,8 +264,9 @@ void perfmon_setupCounterThread_sandybridge(
             {
                 if(haveLock)
                 {
-                    uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0, reg);
-                    uflags &= ~(0xFFFFU);
+                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0, reg);
+                    //uflags &= ~(0xFFFFU);
+                    uflags = (1<<22);
                     uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
                     printf("UFLAGS 0x%x \n",uflags);
                     pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  reg, uflags);
@@ -294,8 +298,9 @@ void perfmon_setupCounterThread_sandybridge(
             {
                 if(haveLock)
                 {
-                    uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1, reg);
-                    uflags &= ~(0xFFFFU);
+                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1, reg);
+                    //uflags &= ~(0xFFFFU);
+                    uflags = (1<<22);
                     uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
                     pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  reg, uflags);
 
@@ -320,6 +325,10 @@ void perfmon_setupCounterThread_sandybridge(
             /* should never be reached */
             break;
     }
+    if (fixed_flags != orig_fixed_flags)
+    {
+        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+    }
 }
 
 void perfmon_startCountersThread_sandybridge(int thread_id)
@@ -392,7 +401,7 @@ void perfmon_startCountersThread_sandybridge(int thread_id)
                 case MBOXFIX:
                     if(haveLock)
                     {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_FIXED_CTL, 0x48000UL);
+                        pci_write(cpu_id, counter_map[i].device,  PCI_UNC_MC_PMON_FIXED_CTL, 0x48000UL);
                     }
                     break;
 
diff --git a/src/includes/perfmon_sandybridge_counters.h b/src/includes/perfmon_sandybridge_counters.h
index 8f709ba..afe9c04 100644
--- a/src/includes/perfmon_sandybridge_counters.h
+++ b/src/includes/perfmon_sandybridge_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description: Counter header file of perfmon module for Sandy Bridge.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -28,8 +28,8 @@
  * =======================================================================================
  */
 
-#define NUM_COUNTERS_SANDYBRIDGE 69
-#define NUM_COUNTERS_UNCORE_SANDYBRIDGE 44
+#define NUM_COUNTERS_SANDYBRIDGE 32
+#define NUM_COUNTERS_UNCORE_SANDYBRIDGE 12
 #define NUM_COUNTERS_CORE_SANDYBRIDGE 8
 
 static PerfmonCounterMap sandybridge_counter_map[NUM_COUNTERS_SANDYBRIDGE] = {
@@ -46,69 +46,30 @@ static PerfmonCounterMap sandybridge_counter_map[NUM_COUNTERS_SANDYBRIDGE] = {
     {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
     /* RAPL counters */
     {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC9, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR2", PMC10, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR3", PMC11, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    /* CBOX counters */
-    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_C0_PMON_CTL0, MSR_UNC_C0_PMON_CTR0, 0, 0},
-    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_C0_PMON_CTL1, MSR_UNC_C0_PMON_CTR1, 0, 0},
-    {"CBOX0C2", PMC14, CBOX0, MSR_UNC_C0_PMON_CTL2, MSR_UNC_C0_PMON_CTR2, 0, 0},
-    {"CBOX0C3", PMC15, CBOX0, MSR_UNC_C0_PMON_CTL3, MSR_UNC_C0_PMON_CTR3, 0, 0},
-    {"CBOX1C0", PMC16, CBOX1, MSR_UNC_C1_PMON_CTL0, MSR_UNC_C1_PMON_CTR0, 0, 0},
-    {"CBOX1C1", PMC17, CBOX1, MSR_UNC_C1_PMON_CTL1, MSR_UNC_C1_PMON_CTR1, 0, 0},
-    {"CBOX1C2", PMC18, CBOX1, MSR_UNC_C1_PMON_CTL2, MSR_UNC_C1_PMON_CTR2, 0, 0},
-    {"CBOX1C3", PMC19, CBOX1, MSR_UNC_C1_PMON_CTL3, MSR_UNC_C1_PMON_CTR3, 0, 0},
-    {"CBOX2C0", PMC20, CBOX2, MSR_UNC_C2_PMON_CTL0, MSR_UNC_C2_PMON_CTR0, 0, 0},
-    {"CBOX2C1", PMC21, CBOX2, MSR_UNC_C2_PMON_CTL1, MSR_UNC_C2_PMON_CTR1, 0, 0},
-    {"CBOX2C2", PMC22, CBOX2, MSR_UNC_C2_PMON_CTL2, MSR_UNC_C2_PMON_CTR2, 0, 0},
-    {"CBOX2C3", PMC23, CBOX2, MSR_UNC_C2_PMON_CTL3, MSR_UNC_C2_PMON_CTR3, 0, 0},
-    {"CBOX3C0", PMC24, CBOX3, MSR_UNC_C3_PMON_CTL0, MSR_UNC_C3_PMON_CTR0, 0, 0},
-    {"CBOX3C1", PMC25, CBOX3, MSR_UNC_C3_PMON_CTL1, MSR_UNC_C3_PMON_CTR1, 0, 0},
-    {"CBOX3C2", PMC26, CBOX3, MSR_UNC_C3_PMON_CTL2, MSR_UNC_C3_PMON_CTR2, 0, 0},
-    {"CBOX3C3", PMC27, CBOX3, MSR_UNC_C3_PMON_CTL3, MSR_UNC_C3_PMON_CTR3, 0, 0},
-    {"CBOX4C0", PMC28, CBOX4, MSR_UNC_C4_PMON_CTL0, MSR_UNC_C4_PMON_CTR0, 0, 0},
-    {"CBOX4C1", PMC29, CBOX4, MSR_UNC_C4_PMON_CTL1, MSR_UNC_C4_PMON_CTR1, 0, 0},
-    {"CBOX4C2", PMC30, CBOX4, MSR_UNC_C4_PMON_CTL2, MSR_UNC_C4_PMON_CTR2, 0, 0},
-    {"CBOX4C3", PMC31, CBOX4, MSR_UNC_C4_PMON_CTL3, MSR_UNC_C4_PMON_CTR3, 0, 0},
-    {"CBOX5C0", PMC32, CBOX5, MSR_UNC_C5_PMON_CTL0, MSR_UNC_C5_PMON_CTR0, 0, 0},
-    {"CBOX5C1", PMC33, CBOX5, MSR_UNC_C5_PMON_CTL1, MSR_UNC_C5_PMON_CTR1, 0, 0},
-    {"CBOX5C2", PMC34, CBOX5, MSR_UNC_C5_PMON_CTL2, MSR_UNC_C5_PMON_CTR2, 0, 0},
-    {"CBOX5C3", PMC35, CBOX5, MSR_UNC_C5_PMON_CTL3, MSR_UNC_C5_PMON_CTR3, 0, 0},
-    {"CBOX6C0", PMC36, CBOX6, MSR_UNC_C6_PMON_CTL0, MSR_UNC_C6_PMON_CTR0, 0, 0},
-    {"CBOX6C1", PMC37, CBOX6, MSR_UNC_C6_PMON_CTL1, MSR_UNC_C6_PMON_CTR1, 0, 0},
-    {"CBOX6C2", PMC38, CBOX6, MSR_UNC_C6_PMON_CTL2, MSR_UNC_C6_PMON_CTR2, 0, 0},
-    {"CBOX6C3", PMC39, CBOX6, MSR_UNC_C6_PMON_CTL3, MSR_UNC_C6_PMON_CTR3, 0, 0},
-    {"CBOX7C0", PMC40, CBOX7, MSR_UNC_C7_PMON_CTL0, MSR_UNC_C7_PMON_CTR0, 0, 0},
-    {"CBOX7C1", PMC41, CBOX7, MSR_UNC_C7_PMON_CTL1, MSR_UNC_C7_PMON_CTR1, 0, 0},
-    {"CBOX7C2", PMC42, CBOX7, MSR_UNC_C7_PMON_CTL2, MSR_UNC_C7_PMON_CTR2, 0, 0},
-    {"CBOX7C3", PMC43, CBOX7, MSR_UNC_C7_PMON_CTL3, MSR_UNC_C7_PMON_CTR3, 0, 0},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0},
     /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
-    {"MBOX0C0",PMC44, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX1C0",PMC45, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX2C0",PMC46, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX3C0",PMC47, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX0C1",PMC48, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX1C1",PMC49, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX2C1",PMC50, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX3C1",PMC51, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX0C2",PMC52, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX1C2",PMC53, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX2C2",PMC54, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX3C2",PMC55, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX0C3",PMC56, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX1C3",PMC57, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX2C3",PMC58, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX3C3",PMC59, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOXFIX",PMC60, MBOXFIX, 0, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_UNC_MC_PMON_FIXED_CTL},
-    /* QPI counters four 48bit  wide per port, split in two reads */
-    {"SBOX0P0",PMC61, SBOX0, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0},
-    {"SBOX1P0",PMC62, SBOX0, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0},
-    {"SBOX2P0",PMC63, SBOX0, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0},
-    {"SBOX3P0",PMC64, SBOX0, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0},
-    {"SBOX0P1",PMC65, SBOX1, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1},
-    {"SBOX1P1",PMC66, SBOX1, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1},
-    {"SBOX2P1",PMC67, SBOX1, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1},
-    {"SBOX3P1",PMC68, SBOX1, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1}
+    {"MBOX0C0",PMC12, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_0},
+    {"MBOX0C1",PMC13, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_0},
+    {"MBOX0C2",PMC14, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_0},
+    {"MBOX0C3",PMC15, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_0},
+    {"MBOX1C0",PMC16, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_1},
+    {"MBOX1C1",PMC17, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_1},
+    {"MBOX1C2",PMC18, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_1},
+    {"MBOX1C3",PMC19, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_1},
+    {"MBOX2C0",PMC20, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_2},
+    {"MBOX2C1",PMC21, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_2},
+    {"MBOX2C2",PMC22, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_2},
+    {"MBOX2C3",PMC23, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_2},
+    {"MBOX3C0",PMC24, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_3},
+    {"MBOX3C1",PMC25, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_3},
+    {"MBOX3C2",PMC26, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_3},
+    {"MBOX3C3",PMC27, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_3},
+    {"MBOX0FIX",PMC28, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_0},
+    {"MBOX1FIX",PMC29, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_1},
+    {"MBOX2FIX",PMC30, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_2},
+    {"MBOX3FIX",PMC31, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_3},
 };
 
 
diff --git a/src/includes/perfmon_sandybridge_events.txt b/src/includes/perfmon_sandybridge_events.txt
index 72f6009..ec4d397 100644
--- a/src/includes/perfmon_sandybridge_events.txt
+++ b/src/includes/perfmon_sandybridge_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel SandyBridge
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
@@ -183,7 +183,10 @@ UMASK_IDQ_MS_MITE_UOPS  0x20
 UMASK_IDQ_MS_UOPS       0x30
 
 EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HITS             0x01
 UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
 
 EVENT_ITLB_MISSES                 0x85      PMC
 UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
@@ -342,11 +345,17 @@ UMASK_MEM_UOP_RETIRED_STORES_LOCK             0x22
 UMASK_MEM_UOP_RETIRED_LOADS_SPLIT             0x41
 UMASK_MEM_UOP_RETIRED_STORES_SPLIT            0x42
 
-EVENT_MEMLOAD_UOPS_RETIRED               0xD1   PMC
-UMASK_MEMLOAD_UOPS_RETIRED_L1_HIT       0x01
-UMASK_MEMLOAD_UOPS_RETIRED_L2_HIT       0x02
-UMASK_MEMLOAD_UOPS_RETIRED_LLC_HIT      0x04
-UMASK_MEMLOAD_UOPS_RETIRED_HIT_LFB      0x40
+EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
 
 EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED               0xD2   PMC
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS         0x01
@@ -406,243 +415,7 @@ EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED          0xD3  PMC
 UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM           0x01
 UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM            0x04
 
-EVENT_CBO_CLOCKTICKS                         0x00  CBOX
-UMASK_CBO_CLOCKTICKS                         0x00
-
-EVENT_COUNTER0_OCCUPANCY              0x1F  CBOX0
-UMASK_COUNTER0_OCCUPANCY              0x00
-
-EVENT_ISMQ_DRD_MISS_OCC              0x21  CBOX0|CBOX1
-UMASK_ISMQ_DRD_MISS_OCC              0x00
-
-EVENT_LLC_LOOKUP              0x34  CBOX0|CBOX1
-UMASK_LLC_LOOKUP_DATA_READ          0x03
-UMASK_LLC_LOOKUP_WRITE              0x05
-UMASK_LLC_LOOKUP_REMOTE_SNOOP       0x09
-UMASK_LLC_LOOKUP_NID                0x41
-
-EVENT_LLC_VICTIMS              0x37  CBOX0|CBOX1
-UMASK_LLC_VICTIMS_M_STATE      0x01
-UMASK_LLC_VICTIMS_E_STATE      0x02
-UMASK_LLC_VICTIMS_S_STATE      0x04
-UMASK_LLC_VICTIMS_MISS         0x08
-UMASK_LLC_VICTIMS_NID          0x40
-
-EVENT_CBO_MISC              0x39  CBOX0|CBOX1
-UMASK_CBO_MISC_RSPI_WAS_FSE      0x01
-UMASK_CBO_MISC_WC_ALIASING       0x02
-UMASK_CBO_MISC_STARTED           0x04
-UMASK_CBO_MISC_RFO_HIT_S         0x08
-
-EVENT_RING_AD_USED              0x1B  CBOX2|CBOX3
-UMASK_RING_AD_USED_UP_EVEN      0x01
-UMASK_RING_AD_USED_UP_ODD       0x02
-UMASK_RING_AD_USED_DOWN_EVEN    0x04
-UMASK_RING_AD_USED_DOWN_ODD     0x08
-
-EVENT_RING_AK_USED              0x1C  CBOX2|CBOX3
-UMASK_RING_AK_USED_UP_EVEN      0x01
-UMASK_RING_AK_USED_UP_ODD       0x02
-UMASK_RING_AK_USED_DOWN_EVEN    0x04
-UMASK_RING_AK_USED_DOWN_ODD     0x08
-
-EVENT_RING_BL_USED              0x1D  CBOX2|CBOX3
-UMASK_RING_BL_USED_UP_EVEN      0x01
-UMASK_RING_BL_USED_UP_ODD       0x02
-UMASK_RING_BL_USED_DOWN_EVEN    0x04
-UMASK_RING_BL_USED_DOWN_ODD     0x08
-
-EVENT_RING_BOUNCES              0x05  CBOX0|CBOX1
-UMASK_RING_BOUNCES_AK_CORE      0x02
-UMASK_RING_BOUNCES_BL_CORE      0x04
-UMASK_RING_BOUNCES_IV_CORE      0x08
-
-EVENT_RING_IV_USED              0x1E  CBOX2|CBOX3
-UMASK_RING_IV_USED_ANY          0x0F
-
-EVENT_RING_SRC_THRTL            0x05  CBOX0|CBOX1
-UMASK_RING_SRC_THRTL            0x07
-
-EVENT_RXR_EXT_STARVED               0x12  CBOX0|CBOX1
-UMASK_RXR_EXT_STARVED_IRQ           0x01
-UMASK_RXR_EXT_STARVED_IPQ           0x02
-UMASK_RXR_EXT_STARVED_ISMQ          0x04
-UMASK_RXR_EXT_STARVED_ISMQ_BIDS     0x08
-
-EVENT_RXR_INSERTS                0x13  CBOX0|CBOX1
-UMASK_RXR_INSERTS_IRQ            0x01
-UMASK_RXR_INSERTS_IRQ_REJECTED   0x02
-UMASK_RXR_INSERTS_IPQ            0x04
-UMASK_RXR_INSERTS_VFIFO          0x10
-
-EVENT_RXR_IPQ_RETRY                0x31  CBOX0|CBOX1
-UMASK_RXR_IPQ_RETRY_ANY            0x01
-UMASK_RXR_IPQ_RETRY_FULL           0x02
-UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT  0x04
-UMASK_RXR_IPQ_RETRY_QPI_CREDITS    0x10
-
-EVENT_RXR_IRQ_RETRY                0x32  CBOX0|CBOX1
-UMASK_RXR_IRQ_RETRY_ANY            0x01
-UMASK_RXR_IRQ_RETRY_FULL           0x02
-UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT  0x04
-UMASK_RXR_IRQ_RETRY_RTID           0x08
-UMASK_RXR_IRQ_RETRY_QPI_CREDITS    0x10
-
-EVENT_RXR_ISMQ_RETRY                0x33  CBOX0|CBOX1
-UMASK_RXR_ISMQ_RETRY_ANY            0x01
-UMASK_RXR_ISMQ_RETRY_FULL           0x02
-UMASK_RXR_ISMQ_RETRY_ADDR_CONFLICT  0x04
-UMASK_RXR_ISMQ_RETRY_RTID           0x08
-UMASK_RXR_ISMQ_RETRY_QPI_CREDITS    0x10
-
-EVENT_RXR_OCCUPANCY                0x11  CBOX0
-UMASK_RXR_OCCUPANCY_IRQ            0x01
-UMASK_RXR_OCCUPANCY_IRQ_REJECTED   0x02
-UMASK_RXR_OCCUPANCY_IPQ            0x04
-UMASK_RXR_OCCUPANCY_VIFO           0x10
-
-EVENT_TOR_INSERTS                    0x35  CBOX1
-UMASK_TOR_INSERTS_OPCODE             0x01
-UMASK_TOR_INSERTS_EVICTION           0x04
-UMASK_TOR_INSERTS_WB                 0x10
-UMASK_TOR_INSERTS_MISS_OPCODE        0x03
-UMASK_TOR_INSERTS_MISS_ALL           0x0A
-UMASK_TOR_INSERTS_NID_OPCODE         0x41
-UMASK_TOR_INSERTS_NID_EVICTION       0x44
-UMASK_TOR_INSERTS_NID_ALL            0x48
-UMASK_TOR_INSERTS_NID_WB             0x50
-UMASK_TOR_INSERTS_NID_MISS_OPCODE    0x43
-UMASK_TOR_INSERTS_NID_MISS_ALL       0x4A
-
-EVENT_TOR_OCCUPANCY                    0x36  CBOX0
-UMASK_TOR_OCCUPANCY_OPCODE             0x01
-UMASK_TOR_OCCUPANCY_EVICTION           0x04
-UMASK_TOR_OCCUPANCY_ALL                0x08
-UMASK_TOR_OCCUPANCY_MISS_OPCODE        0x03
-UMASK_TOR_OCCUPANCY_MISS_ALL           0x0A
-UMASK_TOR_OCCUPANCY_NID_OPCODE         0x41
-UMASK_TOR_OCCUPANCY_NID_EVICTION       0x44
-UMASK_TOR_OCCUPANCY_NID_ALL            0x48
-UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE    0x43
-UMASK_TOR_OCCUPANCY_NID_MISS_ALL       0x4A
-
-EVENT_TXT_ADS_USED                0x04  CBOX0|CBOX1
-UMASK_TXT_ADS_USED            0x00
-
-EVENT_TXT_INSERTS                0x02  CBOX0|CBOX1
-UMASK_TXT_INSERTS_AD_CACHE            0x01
-UMASK_TXT_INSERTS_AK_CACHE            0x02
-UMASK_TXT_INSERTS_BL_CACHE            0x04
-UMASK_TXT_INSERTS_IV_CACHE            0x08
-UMASK_TXT_INSERTS_AD_CORE             0x10
-UMASK_TXT_INSERTS_AK_CORE             0x20
-UMASK_TXT_INSERTS_BL_CORE             0x40
-
-EVENT_HA_CLOCKTICKS                0x00  BBOX
-UMASK_HA_CLOCKTICKS                0x00
-
-EVENT_CONFLICT_CYCLES                0x0B  BBOX
-UMASK_CONFLICT_CYCLES_NO_CONFLICT    0x01
-UMASK_CONFLICT_CYCLES_CONFLICT       0x02
-
-EVENT_DIRECT2CORE_COUNT                0x11  BBOX
-UMASK_DIRECT2CORE_COUNT                0x00
-
-EVENT_DIRECT2CORE_CYCLES_DISABLED      0x12  BBOX
-UMASK_DIRECT2CORE_CYCLES_DISABLED      0x00
-
-EVENT_DIRECT2CORE_TXN_OVERRIDE         0x13  BBOX
-UMASK_DIRECT2CORE_TXN_OVERRIDE         0x00
-
-EVENT_DIRECTORY_LOOKUP             0x0C  BBOX
-UMASK_DIRECTORY_LOOKUP_SNP         0x01
-UMASK_DIRECTORY_LOOKUP_NO_SNP      0x02
-
-EVENT_DIRECTORY_UPDATE             0x0D  BBOX
-UMASK_DIRECTORY_UPDATE_SET         0x01
-UMASK_DIRECTORY_UPDATE_CLEAR       0x02
-UMASK_DIRECTORY_UPDATE_ANY         0x03
-
-EVENT_IGR_NO_CREDIT_CYCLES             0x22  BBOX
-UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0     0x01
-UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1     0x02
-UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0     0x04
-UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1     0x08
-
-EVENT_IMC_RETRY     0x1E  BBOX
-UMASK_IMC_RETRY     0x00
-
-EVENT_IMC_WRITES                   0x1A  BBOX
-UMASK_IMC_WRITES_FULL              0x01
-UMASK_IMC_WRITES_PARTIAL           0x02
-UMASK_IMC_WRITES_FULL_ISOCH        0x04
-UMASK_IMC_WRITES_PARTIAL_ISOCH     0x08
-UMASK_IMC_WRITES_ALL               0x0F
-
-EVENT_REQUESTS                   0x01  BBOX
-UMASK_REQUESTS_READS             0x03
-UMASK_REQUESTS_WRITES            0x0C
-
-EVENT_RPQ_CYCLES_NO_REG_CREDITS           0x15  BBOX
-UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0      0x01
-UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1      0x02
-UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2      0x04
-UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3      0x08
-UMASK_RPQ_CYCLES_NO_REG_CREDITS_ALL       0x0F
-
-EVENT_TAD_REQUESTS_G0               0x1B  BBOX
-UMASK_TAD_REQUESTS_G0_REGION_0      0x01
-UMASK_TAD_REQUESTS_G0_REGION_1      0x02
-UMASK_TAD_REQUESTS_G0_REGION_2      0x04
-UMASK_TAD_REQUESTS_G0_REGION_3      0x08
-UMASK_TAD_REQUESTS_G0_REGION_4      0x10
-UMASK_TAD_REQUESTS_G0_REGION_5      0x20
-UMASK_TAD_REQUESTS_G0_REGION_6      0x40
-UMASK_TAD_REQUESTS_G0_REGION_7      0x80
-
-EVENT_TAD_REQUESTS_G1               0x1C  BBOX
-UMASK_TAD_REQUESTS_G1_REGION_8      0x01
-UMASK_TAD_REQUESTS_G1_REGION_9      0x02
-UMASK_TAD_REQUESTS_G1_REGION_10      0x04
-UMASK_TAD_REQUESTS_G1_REGION_11      0x08
-
-EVENT_TRACKER_INSERTS                   0x06  BBOX
-UMASK_TRACKER_INSERTS_ALL             0x03
-
-EVENT_TXR_AD                   0x0F  BBOX
-UMASK_TXR_AD_NDR             0x01
-UMASK_TXR_AD_SNP             0x02
-
-EVENT_TXR_AD_CYCLES_FULL                  0x2A  BBOX
-UMASK_TXR_AD_CYCLES_FULL_SCHED0           0x01
-UMASK_TXR_AD_CYCLES_FULL_SCHED1           0x02
-UMASK_TXR_AD_CYCLES_FULL_ALL              0x03
-
-EVENT_TXR_AK_CYCLES_FULL                  0x32  BBOX
-UMASK_TXR_AK_CYCLES_FULL_SCHED0           0x01
-UMASK_TXR_AK_CYCLES_FULL_SCHED1           0x02
-UMASK_TXR_AK_CYCLES_FULL_ALL              0x03
-
-EVENT_TXR_AK_NDR              0x0E  BBOX
-UMASK_TXR_AK_NDR              0x00
-
-EVENT_TXR_BL              0x10  BBOX
-UMASK_TXR_BL_DRS_CACHE    0x01
-UMASK_TXR_BL_DRS_CORE     0x02
-UMASK_TXR_BL_DRS_QPI      0x04
-
-EVENT_TXR_BL_CYCLES_FULL                  0x36  BBOX
-UMASK_TXR_BL_CYCLES_FULL_SCHED0           0x01
-UMASK_TXR_BL_CYCLES_FULL_SCHED1           0x02
-UMASK_TXR_BL_CYCLES_FULL_ALL              0x03
-
-EVENT_WPQ_CYCLES_NO_REG_CREDITS                0x18  BBOX
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0           0x01
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1           0x02
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2           0x04
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3           0x08
-
-EVENT_DRAM_CLOCKTICKS             0x00  MBOXFIX
+EVENT_DRAM_CLOCKTICKS             0x00  MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX
 UMASK_DRAM_CLOCKTICKS             0x00
 
 EVENT_ACT_COUNT                  0x01  MBOX
@@ -742,437 +515,3 @@ UMASK_WPQ_READ_HIT           0x00
 
 EVENT_WPQ_WRITE_HIT           0x24  MBOX
 UMASK_WPQ_WRITE_HIT           0x00
-
-EVENT_PCU_CLOCKTICKS           0x00  WBOX
-UMASK_PCU_CLOCKTICKS           0x00
-
-EVENT_CORE0_TRANSITION_CYCLES           0x03  WBOX
-UMASK_CORE0_TRANSITION_CYCLES           0x00
-
-EVENT_CORE1_TRANSITION_CYCLES           0x04  WBOX
-UMASK_CORE1_TRANSITION_CYCLES           0x00
-
-EVENT_CORE2_TRANSITION_CYCLES           0x05  WBOX
-UMASK_CORE2_TRANSITION_CYCLES           0x00
-
-EVENT_CORE3_TRANSITION_CYCLES           0x06  WBOX
-UMASK_CORE3_TRANSITION_CYCLES           0x00
-
-EVENT_CORE4_TRANSITION_CYCLES           0x07  WBOX
-UMASK_CORE4_TRANSITION_CYCLES           0x00
-
-EVENT_CORE5_TRANSITION_CYCLES           0x08  WBOX
-UMASK_CORE5_TRANSITION_CYCLES           0x00
-
-EVENT_CORE6_TRANSITION_CYCLES           0x09  WBOX
-UMASK_CORE6_TRANSITION_CYCLES           0x00
-
-EVENT_CORE7_TRANSITION_CYCLES           0x0A  WBOX
-UMASK_CORE7_TRANSITION_CYCLES           0x00
-
-EVENT_DEMOTIONS_CORE0           0x1E  WBOX
-UMASK_DEMOTIONS_CORE0           0x00
-
-EVENT_DEMOTIONS_CORE1           0x1F  WBOX
-UMASK_DEMOTIONS_CORE1           0x00
-
-EVENT_DEMOTIONS_CORE2           0x20  WBOX
-UMASK_DEMOTIONS_CORE2           0x00
-
-EVENT_DEMOTIONS_CORE3           0x21  WBOX
-UMASK_DEMOTIONS_CORE3           0x00
-
-EVENT_DEMOTIONS_CORE4           0x22  WBOX
-UMASK_DEMOTIONS_CORE4           0x00
-
-EVENT_DEMOTIONS_CORE5           0x23  WBOX
-UMASK_DEMOTIONS_CORE5           0x00
-
-EVENT_DEMOTIONS_CORE6           0x24  WBOX
-UMASK_DEMOTIONS_CORE6           0x00
-
-EVENT_DEMOTIONS_CORE7           0x25  WBOX
-UMASK_DEMOTIONS_CORE7           0x00
-
-EVENT_FREQ_BAND0_CYCLES           0x0B  WBOX
-UMASK_FREQ_BAND0_CYCLES           0x00
-
-EVENT_FREQ_BAND1_CYCLES           0x0C  WBOX
-UMASK_FREQ_BAND1_CYCLES           0x00
-
-EVENT_FREQ_BAND2_CYCLES           0x0D  WBOX
-UMASK_FREQ_BAND2_CYCLES           0x00
-
-EVENT_FREQ_BAND3_CYCLES           0x0E  WBOX
-UMASK_FREQ_BAND3_CYCLES           0x00
-
-EVENT_FREQ_MAX_CURRENT_CYCLES           0x07  WBOX
-UMASK_FREQ_MAX_CURRENT_CYCLES           0x00
-
-EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES           0x04  WBOX
-UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES           0x00
-
-EVENT_FREQ_MAX_POWER_CYCLES           0x05  WBOX
-UMASK_FREQ_MAX_POWER_CYCLES           0x00
-
-EVENT_FREQ_MAX_OS_CYCLES           0x06  WBOX
-UMASK_FREQ_MAX_OS_CYCLES           0x00
-
-EVENT_FREQ_MIN_IO_P_CYCLES           0x01  WBOX
-UMASK_FREQ_MIN_IO_P_CYCLES           0x00
-
-EVENT_FREQ_MIN_PERF_P_CYCLES           0x02  WBOX
-UMASK_FREQ_MIN_PERF_P_CYCLES           0x00
-
-EVENT_FREQ_TRANS_CYCLES           0x00  WBOX
-UMASK_FREQ_TRANS_CYCLES           0x00
-
-EVENT_MEMORY_PHASE_SHEDDING_CYCLES           0x2F  WBOX
-UMASK_MEMORY_PHASE_SHEDDING_CYCLES           0x00
-
-EVENT_POWER_STATE_OCCUPANCY           0x80  WBOX
-UMASK_POWER_STATE_OCCUPANCY_CORES_C0           0x40
-UMASK_POWER_STATE_OCCUPANCY_CORES_C3           0x80
-UMASK_POWER_STATE_OCCUPANCY_CORES_C6           0xC0
-
-EVENT_PROCHOT_EXTERNAL_CYCLES           0x0A  WBOX
-UMASK_PROCHOT_EXTERNAL_CYCLES           0x00
-
-EVENT_PROCHOT_INTERNAL_CYCLES           0x09  WBOX
-UMASK_PROCHOT_INTERNAL_CYCLES           0x00
-
-EVENT_TOTAL_TRANSITION_CYCLES           0x0B  WBOX
-UMASK_TOTAL_TRANSITION_CYCLES           0x00
-
-EVENT_VOLT_TRANS_CYCLES_CHANGE           0x03  WBOX
-UMASK_VOLT_TRANS_CYCLES_CHANGE           0x00
-
-EVENT_VOLT_TRANS_CYCLES_DECREASE           0x02  WBOX
-UMASK_VOLT_TRANS_CYCLES_DECREASE           0x00
-
-EVENT_VOLT_TRANS_CYCLES_INCREASE           0x01  WBOX
-UMASK_VOLT_TRANS_CYCLES_INCREASE           0x00
-
-EVENT_VR_HOT_CYCLES           0x32  WBOX
-UMASK_VR_HOT_CYCLES           0x00
-
-EVENT_QPI_LL_RATE           0xFF  SBOX
-UMASK_QPI_LL_RATE           0x00
-
-
-EVENT_QPI_LL_CLOCKTICKS           0x14  SBOX
-UMASK_QPI_LL_CLOCKTICKS           0x00
-
-EVENT_CTO_COUNT           0x38  SBOX
-UMASK_CTO_COUNT_ANY_DATAC_TO_NODE0    0x1C 0xF8 0x00
-UMASK_CTO_COUNT_ANY_DATAC_TO_NODE1    0x1C 0xF8 0x01
-UMASK_CTO_COUNT_ANY_DATAC_TO_NODE2    0x1C 0xF8 0x02
-UMASK_CTO_COUNT_ANY_DATAC_TO_NODE3    0x1C 0xF8 0x03
-UMASK_CTO_COUNT_WRITE_TO_NODE0        0x1C 0xE0 0x00
-UMASK_CTO_COUNT_WRITE_TO_NODE1        0x1C 0xE0 0x01
-UMASK_CTO_COUNT_WRITE_TO_NODE2        0x1C 0xE0 0x02
-UMASK_CTO_COUNT_WRITE_TO_NODE3        0x1C 0xE0 0x03
-UMASK_CTO_COUNT_NCB_ANY_TO_NODE0      0x18 0xE0 0x00
-UMASK_CTO_COUNT_NCB_ANY_TO_NODE1      0x18 0xE0 0x01
-UMASK_CTO_COUNT_NCB_ANY_TO_NODE2      0x18 0xE0 0x02
-UMASK_CTO_COUNT_NCB_ANY_TO_NODE3      0x18 0xE0 0x03
-UMASK_CTO_COUNT_NCB_INTR_TO_NODE0     0x19 0xF8 0x00
-UMASK_CTO_COUNT_NCB_INTR_TO_NODE1     0x19 0xF8 0x01
-UMASK_CTO_COUNT_NCB_INTR_TO_NODE2     0x19 0xF8 0x02
-UMASK_CTO_COUNT_NCB_INTR_TO_NODE3     0x19 0xF8 0x03
-
-EVENT_DIRECT2CORE           0x13  SBOX
-UMASK_DIRECT2CORE_SUCCESS             0x01
-UMASK_DIRECT2CORE_FAILURE_CREDITS     0x02
-UMASK_DIRECT2CORE_FAILURE_RBT         0x04
-UMASK_DIRECT2COREFAILURE_CREDIRTS_RBT 0x08
-
-EVENT_L1_POWER_CYCLES           0x12  SBOX
-UMASK_L1_POWER_CYCLES           0x00
-
-EVENT_RXL0P_POWER_CYCLES           0x10  SBOX
-UMASK_RXL0P_POWER_CYCLES           0x00
-
-EVENT_RXL0_POWER_CYCLES           0x0F  SBOX
-UMASK_RXL0_POWER_CYCLES           0x00
-
-EVENT_RXL_BYPASSED           0x09  SBOX
-UMASK_RXL_BYPASSED           0x00
-
-EVENT_RXL_CREDITS_CONSUMED_VN0           0x1E  SBOX
-UMASK_RXL_CREDITS_CONSUMED_VN0_DRS       0x01
-UMASK_RXL_CREDITS_CONSUMED_VN0_NCB       0x02
-UMASK_RXL_CREDITS_CONSUMED_VN0_NCS       0x04
-UMASK_RXL_CREDITS_CONSUMED_VN0_HOM       0x08
-UMASK_RXL_CREDITS_CONSUMED_VN0_SNP       0x10
-UMASK_RXL_CREDITS_CONSUMED_VN0_NDR       0x20
-
-EVENT_RXL_CREDITS_CONSUMED_VNA           0x1D  SBOX
-UMASK_RXL_CREDITS_CONSUMED_VNA           0x00
-
-EVENT_RXL_FLITS_G0              0x01  SBOX
-UMASK_RXL_FLITS_G0_IDLE         0x01
-UMASK_RXL_FLITS_G0_DATA         0x02
-UMASK_RXL_FLITS_G0_NON_DATA     0x04
-
-EVENT_RXL_FLITS_G1              0x02  SBOX
-UMASK_RXL_FLITS_G1_SNP          0x01
-UMASK_RXL_FLITS_G1_HOM_REQ      0x02
-UMASK_RXL_FLITS_G1_HOM_NONREQ   0x04
-UMASK_RXL_FLITS_G1_HOM          0x06
-UMASK_RXL_FLITS_G1_DRS_DATA     0x08
-UMASK_RXL_FLITS_G1_DRS_NONDATA  0x10
-UMASK_RXL_FLITS_G1_DRS          0x60
-
-EVENT_RXL_FLITS_G2              0x03  SBOX
-UMASK_RXL_FLITS_G2_NDR_AD       0x01
-UMASK_RXL_FLITS_G2_NDR_AK       0x02
-UMASK_RXL_FLITS_G2_NCB_DATA     0x04
-UMASK_RXL_FLITS_G2_NCB_NODATA   0x08
-UMASK_RXL_FLITS_G2_NCB          0x06
-UMASK_RXL_FLITS_G2_NCS          0x10
-
-EVENT_RXL_INSERTS           0x08  SBOX
-UMASK_RXL_INSERTS           0x00
-
-EVENT_RXL_INSERTS_DRS           0x09  SBOX
-UMASK_RXL_INSERTS_DRS           0x00
-
-EVENT_RXL_INSERTS_HOM           0x0C  SBOX
-UMASK_RXL_INSERTS_HOM           0x00
-
-EVENT_RXL_INSERTS_NCB           0x0A  SBOX
-UMASK_RXL_INSERTS_NCB           0x00
-
-EVENT_RXL_INSERTS_NCS           0x0B  SBOX
-UMASK_RXL_INSERTS_NCS           0x00
-
-EVENT_RXL_INSERTS_NDR           0x0E  SBOX
-UMASK_RXL_INSERTS_NDR           0x00
-
-EVENT_RXL_INSERTS_SNP           0x0D  SBOX
-UMASK_RXL_INSERTS_SNP           0x00
-
-EVENT_RXL_OCCUPANCY           0x0B  SBOX
-UMASK_RXL_OCCUPANCY           0x00
-
-EVENT_RXL_OCCUPANCY_DRS           0x15  SBOX
-UMASK_RXL_OCCUPANCY_DRS           0x00
-
-EVENT_RXL_OCCUPANCY_HOM           0x18  SBOX
-UMASK_RXL_OCCUPANCY_HOM           0x00
-
-EVENT_RXL_OCCUPANCY_NCB           0x16  SBOX
-UMASK_RXL_OCCUPANCY_NCB           0x00
-
-EVENT_RXL_OCCUPANCY_NCS           0x17  SBOX
-UMASK_RXL_OCCUPANCY_NCS           0x00
-
-EVENT_RXL_OCCUPANCY_NDR           0x1A  SBOX
-UMASK_RXL_OCCUPANCY_NDR           0x00
-
-EVENT_RXL_OCCUPANCY_SNP           0x19  SBOX
-UMASK_RXL_OCCUPANCY_SNP           0x00
-
-EVENT_TXL0P_POWER_CYCLES           0x0D  SBOX
-UMASK_TXL0P_POWER_CYCLES           0x00
-
-EVENT_TXL0_POWER_CYCLES           0x0C  SBOX
-UMASK_TXL0_POWER_CYCLES           0x00
-
-EVENT_TXL_BYPASSED           0x05  SBOX
-UMASK_TXL_BYPASSED           0x00
-
-EVENT_TXL_CYCLES_NE           0x06  SBOX
-UMASK_TXL_CYCLES_NE           0x00
-
-EVENT_TXL_FLITS_G0              0x00  SBOX
-UMASK_TXL_FLITS_G0_IDLE         0x01
-UMASK_TXL_FLITS_G0_DATA         0x02
-UMASK_TXL_FLITS_G0_NON_DATA     0x04
-
-EVENT_TXL_FLITS_G1              0x00  SBOX
-UMASK_TXL_FLITS_G1_SNP          0x01
-UMASK_TXL_FLITS_G1_HOM_REQ      0x02
-UMASK_TXL_FLITS_G1_HOM_NONREQ   0x04
-UMASK_TXL_FLITS_G1_HOM          0x06
-UMASK_TXL_FLITS_G1_DRS_DATA     0x08
-UMASK_TXL_FLITS_G1_DRS_NONDATA  0x10
-UMASK_TXL_FLITS_G1_DRS          0x60
-
-EVENT_TXL_FLITS_G2              0x01  SBOX
-UMASK_TXL_FLITS_G2_NDR_AD       0x01
-UMASK_TXL_FLITS_G2_NDR_AK       0x02
-UMASK_TXL_FLITS_G2_NCB_DATA     0x04
-UMASK_TXL_FLITS_G2_NCB_NODATA   0x08
-UMASK_TXL_FLITS_G2_NCB          0x06
-UMASK_TXL_FLITS_G2_NCS          0x10
-
-EVENT_TXL_INSERTS           0x04  SBOX
-UMASK_TXL_INSERTS           0x00
-
-EVENT_TXL_OCCUPANCY           0x07  SBOX
-UMASK_TXL_OCCUPANCY           0x00
-
-EVENT_CREDIT_RETURNS           0x1C  SBOX
-UMASK_CREDIT_RETURNS           0x00
-
-EVENT_CREDIT_RETURN_OCCUPANCY           0x1B  SBOX
-UMASK_CREDIT_RETURN_OCCUPANCY           0x00
-
-EVENT_R2PCIE_CLOCKTICKS           0x01  PBOX
-UMASK_R2PCIE_CLOCKTICKS           0x00
-
-EVENT_RING_AD_USED                  0x07  PBOX
-UMASK_RING_AD_USED_CW_EVEN          0x01
-UMASK_RING_AD_USED_CW_ODD           0x02
-UMASK_RING_AD_USED_CCW_EVEN         0x04
-UMASK_RING_AD_USED_CCW_EVEN         0x08
-
-EVENT_RING_AK_USED                  0x08  PBOX
-UMASK_RING_AK_USED_CW_EVEN          0x01
-UMASK_RING_AK_USED_CW_ODD           0x02
-UMASK_RING_AK_USED_CCW_EVEN         0x04
-UMASK_RING_AK_USED_CCW_EVEN         0x08
-
-EVENT_RING_BL_USED                  0x09  PBOX
-UMASK_RING_BL_USED_CW_EVEN          0x01
-UMASK_RING_BL_USED_CW_ODD           0x02
-UMASK_RING_BL_USED_CCW_EVEN         0x04
-UMASK_RING_BL_USED_CCW_EVEN         0x08
-
-EVENT_RING_IV_USED                  0x0A  PBOX
-UMASK_RING_IV_USED_ANY              0x0F
-
-EVENT_RXR_AK_BOUNCES              0x12  PBOX0
-UMASK_RXR_AK_BOUNCES              0x00
-
-EVENT_RXR_CYCLES_NE              0x10  PBOX0|PBOX1
-UMASK_RXR_CYCLES_NE_DRS              0x08
-UMASK_RXR_CYCLES_NE_NCB              0x10
-UMASK_RXR_CYCLES_NE_NCS              0x20
-
-EVENT_TXR_CYCLES_FULL              0x25  PBOX0
-UMASK_TXR_CYCLES_FULL_AD              0x01
-UMASK_TXR_CYCLES_FULL_AK              0x02
-UMASK_TXR_CYCLES_FULL_BL              0x04
-
-EVENT_TXR_CYCLES_NE              0x23  PBOX0
-UMASK_TXR_CYCLES_NE_AD              0x01
-UMASK_TXR_CYCLES_NE_AK              0x02
-UMASK_TXR_CYCLES_NE_BL              0x04
-
-EVENT_TXR_INSERTS              0x24  PBOX0
-UMASK_TXR_INSERTS              0x00
-
-EVENT_R3QPI_CLOCKTICKS              0x01  RBOX
-UMASK_R3QPI_CLOCKTICKS              0x00
-
-EVENT_IIO_CREDITS_ACQUIRED              0x20  RBOX
-UMASK_IIO_CREDITS_ACQUIRED_DRS              0x08
-UMASK_IIO_CREDITS_ACQUIRED_NCB              0x10
-UMASK_IIO_CREDITS_ACQUIRED_NCS              0x20
-
-EVENT_IIO_CREDITS_REJECT              0x21  RBOX
-UMASK_IIO_CREDITS_REJECT_DRS              0x08
-UMASK_IIO_CREDITS_REJECT_NCB              0x10
-UMASK_IIO_CREDITS_REJECT_NCS              0x20
-
-EVENT_IIO_CREDITS_USED              0x22  RBOX
-UMASK_IIO_CREDITS_USED_DRS              0x08
-UMASK_IIO_CREDITS_USED_NCB              0x10
-UMASK_IIO_CREDITS_USED_NCS              0x20
-
-EVENT_RING_AD_USED              0x07  RBOX
-UMASK_RING_AD_USED_CW_EVEN      0x01
-UMASK_RING_AD_USED_CW_ODD       0x02
-UMASK_RING_AD_USED_CCW_EVEN     0x04
-UMASK_RING_AD_USED_CCW_ODD      0x08
-
-EVENT_RING_AK_USED              0x08  RBOX
-UMASK_RING_AK_USED_CW_EVEN      0x01
-UMASK_RING_AK_USED_CW_ODD       0x02
-UMASK_RING_AK_USED_CCW_EVEN     0x04
-UMASK_RING_AK_USED_CCW_ODD      0x08
-
-EVENT_RING_BL_USED              0x09  RBOX
-UMASK_RING_BL_USED_CW_EVEN      0x01
-UMASK_RING_BL_USED_CW_ODD       0x02
-UMASK_RING_BL_USED_CCW_EVEN     0x04
-UMASK_RING_BL_USED_CCW_ODD      0x08
-
-EVENT_RING_IV_USED          0x0A  RBOX
-UMASK_RING_IV_USED_ANY      0x0F
-
-EVENT_RXR_BYPASSED          0x12  RBOX
-UMASK_RXR_BYPASSED          0x00
-
-EVENT_RXR_CYCLES_NE         0x10  RBOX
-UMASK_RXR_CYCLES_NE_HOM     0x01
-UMASK_RXR_CYCLES_NE_SNP     0x02
-UMASK_RXR_CYCLES_NE_NDR     0x04
-UMASK_RXR_CYCLES_NE_DRS     0x08
-UMASK_RXR_CYCLES_NE_NCB     0x10
-UMASK_RXR_CYCLES_NE_NCS     0x20
-
-EVENT_RXR_INSERTS         0x10  RBOX
-UMASK_RXR_INSERTS_HOM     0x01
-UMASK_RXR_INSERTS_SNP     0x02
-UMASK_RXR_INSERTS_NDR     0x04
-UMASK_RXR_INSERTS_DRS     0x08
-UMASK_RXR_INSERTS_NCB     0x10
-UMASK_RXR_INSERTS_NCS     0x20
-
-EVENT_RXR_OCCUPANCY         0x13  RBOX
-UMASK_RXR_OCCUPANCY_HOM     0x01
-UMASK_RXR_OCCUPANCY_SNP     0x02
-UMASK_RXR_OCCUPANCY_NDR     0x04
-UMASK_RXR_OCCUPANCY_DRS     0x08
-UMASK_RXR_OCCUPANCY_NCB     0x10
-UMASK_RXR_OCCUPANCY_NCS     0x20
-
-EVENT_TXR_CYCLES_FULL       0x25  RBOX
-UMASK_TXR_CYCLES_FULL       0x00
-
-EVENT_TXR_CYCLES_NE       0x23  RBOX
-UMASK_TXR_CYCLES_NE       0x00
-
-EVENT_TXR_INSERTS       0x24  RBOX
-UMASK_TXR_INSERTS       0x00
-
-EVENT_TXR_NACK       0x26  RBOX
-UMASK_TXR_NACK       0x00
-
-EVENT_VN0_CREDITS_REJECT      0x37  RBOX
-UMASK_VN0_CREDITS_REJECT_HOM     0x01
-UMASK_VN0_CREDITS_REJECT_SNP     0x02
-UMASK_VN0_CREDITS_REJECT_NDR     0x04
-UMASK_VN0_CREDITS_REJECT_DRS     0x08
-UMASK_VN0_CREDITS_REJECT_NCB     0x10
-UMASK_VN0_CREDITS_REJECT_NCS     0x20
-
-EVENT_VN0_CREDITS_USED      0x36  RBOX
-UMASK_VN0_CREDITS_USED_HOM     0x01
-UMASK_VN0_CREDITS_USED_SNP     0x02
-UMASK_VN0_CREDITS_USED_NDR     0x04
-UMASK_VN0_CREDITS_USED_DRS     0x08
-UMASK_VN0_CREDITS_USED_NCB     0x10
-UMASK_VN0_CREDITS_USED_NCS     0x20
-
-EVENT_VNA_CREDITS_ACQUIRED      0x33  RBOX
-UMASK_VNA_CREDITS_ACQUIRED     0x00
-
-EVENT_VNA_CREDITS_REJECT      0x34  RBOX
-UMASK_VNA_CREDITS_REJECT_HOM     0x01
-UMASK_VNA_CREDITS_REJECT_SNP     0x02
-UMASK_VNA_CREDITS_REJECT_NDR     0x04
-UMASK_VNA_CREDITS_REJECT_DRS     0x08
-UMASK_VNA_CREDITS_REJECT_NCB     0x10
-UMASK_VNA_CREDITS_REJECT_NCS     0x20
-
-EVENT_VNA_CREDITS_CYCLES_OUT      0x31  RBOX
-UMASK_VNA_CREDITS_CYCLES_OUT     0x00
-
-EVENT_VNA_CREDITS_CYCLES_USED      0x32  RBOX
-UMASK_VNA_CREDITS_CYCLESUSED     0x00
-
diff --git a/src/includes/perfmon_haswell.h b/src/includes/perfmon_silvermont.h
similarity index 63%
copy from src/includes/perfmon_haswell.h
copy to src/includes/perfmon_silvermont.h
index 0352476..9cfd6f1 100644
--- a/src/includes/perfmon_haswell.h
+++ b/src/includes/perfmon_silvermont.h
@@ -1,12 +1,12 @@
 /*
  * =======================================================================================
  *
- *      Filename:  perfmon_haswell.h
+ *      Filename:  perfmon_silvermont.h
  *
- *      Description:  Header File of perfmon module for Haswell.
+ *      Description:  Header file of perfmon module for Intel Atom Silvermont
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -27,95 +27,58 @@
  *
  * =======================================================================================
  */
+ 
+#include <perfmon_silvermont_events.h>
+#include <perfmon_silvermont_groups.h>
+#include <perfmon_silvermont_counters.h>
 
-#include <perfmon_haswell_events.h>
-#include <perfmon_haswell_groups.h>
-#include <perfmon_haswell_counters.h>
+static int perfmon_numCountersSilvermont = NUM_COUNTERS_SILVERMONT;
+static int perfmon_numGroupsSilvermont = NUM_GROUPS_SILVERMONT;
+static int perfmon_numArchEventsSilvermont = NUM_ARCH_EVENTS_SILVERMONT;
 
-static int perfmon_numCountersHaswell = NUM_COUNTERS_HASWELL;
-static int perfmon_numGroupsHaswell = NUM_GROUPS_HASWELL;
-static int perfmon_numArchEventsHaswell = NUM_ARCH_EVENTS_HASWELL;
 
-
-#define OFFSET_PMC 3
-
-void perfmon_init_haswell(PerfmonThread *thread)
+void perfmon_init_silvermont(PerfmonThread *thread)
 {
     uint64_t flags = 0x0ULL;
     int cpu_id = thread->processorId;
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
 
     /* Initialize registers */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
     msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
     msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
+    msr_write(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL);
+    msr_write(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL);
+
+    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
     msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
     msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    /* Preinit of PERFEVSEL registers */
-    flags |= (1<<22);  /* enable flag */
-    flags |= (1<<16);  /* user mode flag */
-
-    msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
-    {
-
-    }
-
 }
 
-void perfmon_setupCounterThread_haswell(
+void perfmon_setupCounterThread_silvermont(
         int thread_id,
         PerfmonEvent* event,
         PerfmonCounterIndex index)
 {
     int haveLock = 0;
-    uint64_t flags;
+    uint64_t flags = 0x0ULL;
     uint32_t uflags;
-    uint64_t reg = haswell_counter_map[index].configRegister;
+    uint64_t reg = silvermont_counter_map[index].configRegister;
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
+    uint64_t orig_fixed_flags = fixed_flags;
     perfmon_threadData[thread_id].counters[index].init = TRUE;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
-    {
-        haveLock = 1;
-    }
-
-    switch (haswell_counter_map[index].type)
+    switch (silvermont_counter_map[index].type)
     {
         case PMC:
 
-            flags = msr_read(cpu_id,reg);
+            flags = (1<<16)|(1<<22);
             flags &= ~(0xFFFFU);   /* clear lower 16bits */
 
             /* Intel with standard 8 bit event mask: [7:0] */
             flags |= (event->umask<<8) + event->eventId;
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
-            {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
-            }
+
 
             if (perfmon_verbose)
             {
@@ -124,11 +87,32 @@ void perfmon_setupCounterThread_haswell(
                         LLU_CAST reg,
                         LLU_CAST flags);
             }
-
             msr_write(cpu_id, reg , flags);
+
+            // Offcore event with additional configuration register
+            // We included the additional register as counterRegister2
+            // to avoid creating a new data structure
+            // cfgBits contain offset of "request type" bit
+            // cmask contain offset of "response type" bit
+            if (event->eventId == 0xB7) 
+            {
+                if (event->umask == 0x01)
+                {
+                    reg = MSR_OFFCORE_RESP0;
+                }
+                else if (event->umask == 0x02)
+                {
+                    reg = MSR_OFFCORE_RESP1;
+                }
+                flags = 0x0ULL;
+                flags = (1<<event->cfgBits)|(1<<event->cmask);
+                msr_write(cpu_id, reg , flags);
+            }
+
             break;
 
         case FIXED:
+            fixed_flags |= (2ULL<<(index*4));
             break;
 
         case POWER:
@@ -138,13 +122,19 @@ void perfmon_setupCounterThread_haswell(
             /* should never be reached */
             break;
     }
+    if (fixed_flags != orig_fixed_flags)
+    {
+        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+    }
 }
 
-void perfmon_startCountersThread_haswell(int thread_id)
+
+void perfmon_startCountersThread_silvermont(int thread_id)
 {
     int haveLock = 0;
     uint64_t flags = 0x0ULL;
     uint32_t uflags = 0x10000UL; /* Clear freeze bit */
+    uint64_t fixed_flags = 0x0ULL;
     int cpu_id = perfmon_threadData[thread_id].processorId;
 
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
@@ -154,19 +144,19 @@ void perfmon_startCountersThread_haswell(int thread_id)
 
     msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
 
-    for ( int i=0; i<perfmon_numCountersHaswell; i++ )
+    for ( int i=0; i<perfmon_numCountersSilvermont; i++ )
     {
         if (perfmon_threadData[thread_id].counters[i].init == TRUE)
         {
-            switch (haswell_counter_map[i].type)
+            switch (silvermont_counter_map[i].type)
             {
                 case PMC:
-                    msr_write(cpu_id, haswell_counter_map[i].counterRegister, 0x0ULL);
+                    msr_write(cpu_id, silvermont_counter_map[i].counterRegister, 0x0ULL);
                     flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
                     break;
 
                 case FIXED:
-                    msr_write(cpu_id, haswell_counter_map[i].counterRegister, 0x0ULL);
+                    msr_write(cpu_id, silvermont_counter_map[i].counterRegister, 0x0ULL);
                     flags |= (1ULL<<(i+32));  /* enable fixed counter */
                     break;
 
@@ -174,7 +164,7 @@ void perfmon_startCountersThread_haswell(int thread_id)
                     if(haveLock)
                     {
                         perfmon_threadData[thread_id].counters[i].counterData =
-                            power_read(cpu_id, haswell_counter_map[i].counterRegister);
+                            power_read(cpu_id, silvermont_counter_map[i].counterRegister);
                     }
 
                     break;
@@ -193,12 +183,14 @@ void perfmon_startCountersThread_haswell(int thread_id)
         printf("perfmon_start_counters: Write Register 0x%X , \
                 Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
     }
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+    if (flags != 0x0ULL)
+    {
+        msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
+    }
 }
 
-void perfmon_stopCountersThread_haswell(int thread_id)
+
+void perfmon_stopCountersThread_silvermont(int thread_id)
 {
     uint64_t flags;
     uint32_t uflags = 0x10100UL; /* Set freeze bit */
@@ -213,17 +205,17 @@ void perfmon_stopCountersThread_haswell(int thread_id)
 
     msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
 
-    for ( int i=0; i < perfmon_numCountersHaswell; i++ ) 
+    for ( int i=0; i < perfmon_numCountersSilvermont; i++ ) 
     {
         if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
         {
-            switch (haswell_counter_map[i].type)
+            switch (silvermont_counter_map[i].type)
             {
                 case PMC:
 
                 case FIXED:
                     perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, haswell_counter_map[i].counterRegister);
+                        (double)msr_read(cpu_id, silvermont_counter_map[i].counterRegister);
                     break;
 
                 case POWER:
@@ -231,7 +223,7 @@ void perfmon_stopCountersThread_haswell(int thread_id)
                     {
                         perfmon_threadData[thread_id].counters[i].counterData =
                             power_info.energyUnit *
-                            ( power_read(cpu_id, haswell_counter_map[i].counterRegister) -
+                            ( power_read(cpu_id, silvermont_counter_map[i].counterRegister) -
                               perfmon_threadData[thread_id].counters[i].counterData);
                     }
                     break;
@@ -256,7 +248,7 @@ void perfmon_stopCountersThread_haswell(int thread_id)
     }
 }
 
-void perfmon_readCountersThread_haswell(int thread_id)
+void perfmon_readCountersThread_silvermont(int thread_id)
 {
     uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
@@ -267,26 +259,26 @@ void perfmon_readCountersThread_haswell(int thread_id)
         haveLock = 1;
     }
 
-    for ( int i=0; i<perfmon_numCountersHaswell; i++ )
+    for ( int i=0; i<perfmon_numCountersSilvermont; i++ )
     {
         if (perfmon_threadData[thread_id].counters[i].init == TRUE)
         {
-            if ((haswell_counter_map[i].type == PMC) ||
-                    (haswell_counter_map[i].type == FIXED))
+            if ((silvermont_counter_map[i].type == PMC) ||
+                    (silvermont_counter_map[i].type == FIXED))
             {
                 perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, haswell_counter_map[i].counterRegister);
+                    msr_read(cpu_id, silvermont_counter_map[i].counterRegister);
             }
             else
             {
                 if(haveLock)
                 {
-                    switch (haswell_counter_map[i].type)
+                    switch (silvermont_counter_map[i].type)
                     {
                         case POWER:
                             perfmon_threadData[thread_id].counters[i].counterData =
                                 power_info.energyUnit *
-                                power_read(cpu_id, haswell_counter_map[i].counterRegister);
+                                power_read(cpu_id, silvermont_counter_map[i].counterRegister);
                             break;
 
                         default:
@@ -298,4 +290,3 @@ void perfmon_readCountersThread_haswell(int thread_id)
         }
     }
 }
-
diff --git a/src/includes/perfmon_haswell_counters.h b/src/includes/perfmon_silvermont_counters.h
similarity index 68%
copy from src/includes/perfmon_haswell_counters.h
copy to src/includes/perfmon_silvermont_counters.h
index 4302efe..266ee4b 100644
--- a/src/includes/perfmon_haswell_counters.h
+++ b/src/includes/perfmon_silvermont_counters.h
@@ -1,12 +1,12 @@
 /*
  * =======================================================================================
  *
- *      Filename:  perfmon_haswell_counters.h
+ *      Filename:  perfmon_silvermont_counters.h
  *
- *      Description:  Counter Header File of perfmon module for Haswell.
+ *      Description: Counter header file of perfmon module for Silvermont.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -28,10 +28,11 @@
  * =======================================================================================
  */
 
-#define NUM_COUNTERS_HASWELL 12
-#define NUM_COUNTERS_CORE_HASWELL 7
+#define NUM_COUNTERS_CORE_SILVERMONT 6
+#define NUM_COUNTERS_UNCORE_SILVERMONT 0
+#define NUM_COUNTERS_SILVERMONT 8
 
-static PerfmonCounterMap haswell_counter_map[NUM_COUNTERS_HASWELL] = {
+static PerfmonCounterMap silvermont_counter_map[NUM_COUNTERS_SILVERMONT] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
     {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
     {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
@@ -39,14 +40,11 @@ static PerfmonCounterMap haswell_counter_map[NUM_COUNTERS_HASWELL] = {
     /* PMC Counters: 4 48bit wide */
     {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
     {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
     /* Temperature Sensor*/
-    {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
+    {"TMP0", PMC5, THERMAL, 0, 0, 0, 0},
     /* RAPL counters */
-    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC9, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR2", PMC10, POWER, 0, MSR_PKG_ENERGY_STATUS,  0, 0},
-    {"PWR3", PMC11, POWER, 0, MSR_PKG_ENERGY_STATUS,  0, 0},
+    {"PWR0", PMC6, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
+    {"PWR1", PMC7, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
 };
 
+
diff --git a/src/includes/perfmon_silvermont_events.txt b/src/includes/perfmon_silvermont_events.txt
new file mode 100644
index 0000000..b8a088d
--- /dev/null
+++ b/src/includes/perfmon_silvermont_events.txt
@@ -0,0 +1,440 @@
+# =======================================================================================
+#  
+#      Filename:  perfmon_silvermont_events.txt
+# 
+#      Description:  Event list for Intel Atom (Silvermont)
+# 
+#      Version:   3.1.3
+#      Released:  4.11.2014
+# 
+#      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2014 Jan Treibig
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_REHABQ                    0x03  PMC
+UMASK_REHABQ_LD_BLOCK_ST_FORWARD    0x01
+UMASK_REHABQ_LD_BLOCK_STD_NOTREADY  0x02
+UMASK_REHABQ_ST_SPLITS              0x04
+UMASK_REHABQ_LD_SPLITS              0x08
+UMASK_REHABQ_LOCK                   0x10
+UMASK_REHABQ_STA_FULL               0x20
+UMASK_REHABQ_ANY_LD                 0x40
+UMASK_REHABQ_ANY_ST                 0x80
+
+EVENT_MEM_UOPS_RETIRED            0x04  PMC
+UMASK_MEM_UOPS_RETIRED_L1_MISS_LOADS 0x01
+UMASK_MEM_UOPS_RETIRED_L2_HIT_LOADS 0x02
+UMASK_MEM_UOPS_RETIRED_L2_MISS_LOADS 0x04
+UMASK_MEM_UOPS_RETIRED_DTLB_MISS_LOADS 0x08
+UMASK_MEM_UOPS_RETIRED_UTLB_MISS    0x10
+UMASK_MEM_UOPS_RETIRED_HITM         0x20
+UMASK_MEM_UOPS_RETIRED_ALL_LOADS    0x40
+UMASK_MEM_UOPS_RETIRED_ALL_STORES   0x80
+
+EVENT_PAGE_WALKS                    0x05 PMC
+UMASK_PAGE_WALKS_D_SIDE_CYCLES      0x01
+UMASK_PAGE_WALKS_I_SIDE_CYCLES      0x02
+UMASK_PAGE_WALKS_WALKS              0x03
+
+EVENT_LONGEST_LAT_CACHE             0x2E PMC
+UMASK_LONGEST_LAT_CACHE_MISS        0x41
+UMASK_LONGEST_LAT_CACHE_REFERENCE   0x4F
+
+EVENT_L2_REJECT_XQ                  0x30 PMC
+UMASK_L2_REJECT_XQ_ALL              0x00
+
+EVENT_CORE_REJECT_L2Q               0x31 PMC
+UMASK_CORE_REJECT_L2Q_ALL           0x00
+
+EVENT_CPU_CLK_UNHALTED              0x3C PMC
+UMASK_CPU_CLK_UNHALTED_CORE_P       0x00
+UMASK_CPU_CLK_UNHALTED_REF_P        0x01
+
+EVENT_ICACHE                        0x80 PMC
+UMASK_ICACHE_HIT                    0x01
+UMASK_ICACHE_MISSES                 0x02
+UMASK_ICACHE_ACCESSES               0x03
+UMASK_ICACHE_IFETCH_STALL           0x04
+
+EVENT_NIP_STALL                     0xB6 PMC
+UMASK_NIP_STALL_ICACHE_MISS         0x04
+
+EVENT_OFFCORE_RESPONSE              0xB7 PMC
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_L2_HIT        0x01 0x00 0x12
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNP_NONE      0x01 0x00 0x1F
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNOOP_MISS    0x01 0x00 0x21
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNOOP_HIT     0x01 0x00 0x22
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_HITM          0x01 0x00 0x24
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_NON_DRAM      0x01 0x00 0x25
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT       0x01 0x00 0x26
+
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY           0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_L2_HIT        0x01 0x01 0x12
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNP_NONE      0x01 0x01 0x1F
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNOOP_MISS    0x01 0x01 0x21
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNOOP_HIT     0x01 0x01 0x22
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_HITM          0x01 0x01 0x24
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_NON_DRAM      0x01 0x01 0x25
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_AVG_LAT       0x01 0x01 0x26
+
+UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_L2_HIT        0x01 0x02 0x12
+UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNP_NONE      0x01 0x02 0x1F
+UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNOOP_MISS    0x01 0x02 0x21
+UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNOOP_HIT     0x01 0x02 0x22
+UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_HITM          0x01 0x02 0x24
+UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_NON_DRAM      0x01 0x02 0x25
+UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_AVG_LAT       0x01 0x02 0x26
+
+UMASK_OFFCORE_RESPONSE_0_WB_ANY           0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_L2_HIT        0x01 0x03 0x12
+UMASK_OFFCORE_RESPONSE_0_WB_SNP_NONE      0x01 0x03 0x1F
+UMASK_OFFCORE_RESPONSE_0_WB_SNOOP_MISS    0x01 0x03 0x21
+UMASK_OFFCORE_RESPONSE_0_WB_SNOOP_HIT     0x01 0x03 0x22
+UMASK_OFFCORE_RESPONSE_0_WB_HITM          0x01 0x03 0x24
+UMASK_OFFCORE_RESPONSE_0_WB_NON_DRAM      0x01 0x03 0x25
+UMASK_OFFCORE_RESPONSE_0_WB_AVG_LAT       0x01 0x03 0x26
+
+UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_ANY           0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_L2_HIT        0x01 0x04 0x12
+UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNP_NONE      0x01 0x04 0x1F
+UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNOOP_MISS    0x01 0x04 0x21
+UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNOOP_HIT     0x01 0x04 0x22
+UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_HITM          0x01 0x04 0x24
+UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_NON_DRAM      0x01 0x04 0x25
+UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_AVG_LAT       0x01 0x04 0x26
+
+UMASK_OFFCORE_RESPONSE_0_PF_RFO_ANY           0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_RFO_L2_HIT        0x01 0x05 0x12
+UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNP_NONE      0x01 0x05 0x1F
+UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNOOP_MISS    0x01 0x05 0x21
+UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNOOP_HIT     0x01 0x05 0x22
+UMASK_OFFCORE_RESPONSE_0_PF_RFO_HITM          0x01 0x05 0x24
+UMASK_OFFCORE_RESPONSE_0_PF_RFO_NON_DRAM      0x01 0x05 0x25
+UMASK_OFFCORE_RESPONSE_0_PF_RFO_AVG_LAT       0x01 0x05 0x26
+
+UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_ANY           0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_L2_HIT        0x01 0x06 0x12
+UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNP_NONE      0x01 0x06 0x1F
+UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNOOP_MISS    0x01 0x06 0x21
+UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNOOP_HIT     0x01 0x06 0x22
+UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_HITM          0x01 0x06 0x24
+UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_NON_DRAM      0x01 0x06 0x25
+UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_AVG_LAT       0x01 0x06 0x26
+
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_ANY           0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_L2_HIT        0x01 0x07 0x12
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNP_NONE      0x01 0x07 0x1F
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNOOP_MISS    0x01 0x07 0x21
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNOOP_HIT     0x01 0x07 0x22
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_HITM          0x01 0x07 0x24
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_NON_DRAM      0x01 0x07 0x25
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_AVG_LAT       0x01 0x07 0x26
+
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_ANY           0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_L2_HIT        0x01 0x08 0x12
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNP_NONE      0x01 0x08 0x1F
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNOOP_MISS    0x01 0x08 0x21
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNOOP_HIT     0x01 0x08 0x22
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_HITM          0x01 0x08 0x24
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_NON_DRAM      0x01 0x08 0x25
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_AVG_LAT       0x01 0x08 0x26
+
+UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_ANY           0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_L2_HIT        0x01 0x09 0x12
+UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNP_NONE      0x01 0x09 0x1F
+UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNOOP_MISS    0x01 0x09 0x21
+UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNOOP_HIT     0x01 0x09 0x22
+UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_HITM          0x01 0x09 0x24
+UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_NON_DRAM      0x01 0x09 0x25
+UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_AVG_LAT       0x01 0x09 0x26
+
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_ANY           0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_L2_HIT        0x01 0x0A 0x12
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNP_NONE      0x01 0x0A 0x1F
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNOOP_MISS    0x01 0x0A 0x21
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNOOP_HIT     0x01 0x0A 0x22
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_HITM          0x01 0x0A 0x24
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_NON_DRAM      0x01 0x0A 0x25
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_AVG_LAT       0x01 0x0A 0x26
+
+UMASK_OFFCORE_RESPONSE_0_STRM_ST_ANY           0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_STRM_ST_L2_HIT        0x01 0x0B 0x12
+UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNP_NONE      0x01 0x0B 0x1F
+UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNOOP_MISS    0x01 0x0B 0x21
+UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNOOP_HIT     0x01 0x0B 0x22
+UMASK_OFFCORE_RESPONSE_0_STRM_ST_HITM          0x01 0x0B 0x24
+UMASK_OFFCORE_RESPONSE_0_STRM_ST_NON_DRAM      0x01 0x0B 0x25
+UMASK_OFFCORE_RESPONSE_0_STRM_ST_AVG_LAT       0x01 0x0B 0x26
+
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_ANY           0x01 0x0C 0x10
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_L2_HIT        0x01 0x0C 0x12
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNP_NONE      0x01 0x0C 0x1F
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNOOP_MISS    0x01 0x0C 0x21
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNOOP_HIT     0x01 0x0C 0x22
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_HITM          0x01 0x0C 0x24
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_NON_DRAM      0x01 0x0C 0x25
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_AVG_LAT       0x01 0x0C 0x26
+
+UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_ANY           0x01 0x0D 0x10
+UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_L2_HIT        0x01 0x0D 0x12
+UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNP_NONE      0x01 0x0D 0x1F
+UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNOOP_MISS    0x01 0x0D 0x21
+UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNOOP_HIT     0x01 0x0D 0x22
+UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_HITM          0x01 0x0D 0x24
+UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_NON_DRAM      0x01 0x0D 0x25
+UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_AVG_LAT       0x01 0x0D 0x26
+
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_ANY           0x01 0x0E 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_L2_HIT        0x01 0x0E 0x12
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNP_NONE      0x01 0x0E 0x1F
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNOOP_MISS    0x01 0x0E 0x21
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNOOP_HIT     0x01 0x0E 0x22
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_HITM          0x01 0x0E 0x24
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_NON_DRAM      0x01 0x0E 0x25
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_AVG_LAT       0x01 0x0E 0x26
+
+UMASK_OFFCORE_RESPONSE_0_ANY_ANY           0x01 0x0F 0x10
+UMASK_OFFCORE_RESPONSE_0_ANY_L2_HIT        0x01 0x0F 0x12
+UMASK_OFFCORE_RESPONSE_0_ANY_SNP_NONE      0x01 0x0F 0x1F
+UMASK_OFFCORE_RESPONSE_0_ANY_SNOOP_MISS    0x01 0x0F 0x21
+UMASK_OFFCORE_RESPONSE_0_ANY_SNOOP_HIT     0x01 0x0F 0x22
+UMASK_OFFCORE_RESPONSE_0_ANY_HITM          0x01 0x0F 0x24
+UMASK_OFFCORE_RESPONSE_0_ANY_NON_DRAM      0x01 0x0F 0x25
+UMASK_OFFCORE_RESPONSE_0_ANY_AVG_LAT       0x01 0x0F 0x26
+
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x02 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_L2_HIT        0x02 0x00 0x12
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNP_NONE      0x02 0x00 0x1F
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNOOP_MISS    0x02 0x00 0x21
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNOOP_HIT     0x02 0x00 0x22
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_HITM          0x02 0x00 0x24
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_NON_DRAM      0x02 0x00 0x25
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_AVG_LAT       0x02 0x00 0x26
+
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY           0x02 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_L2_HIT        0x02 0x01 0x12
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNP_NONE      0x02 0x01 0x1F
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNOOP_MISS    0x02 0x01 0x21
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNOOP_HIT     0x02 0x01 0x22
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_HITM          0x02 0x01 0x24
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_NON_DRAM      0x02 0x01 0x25
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_AVG_LAT       0x02 0x01 0x26
+
+UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_ANY           0x02 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_L2_HIT        0x02 0x02 0x12
+UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNP_NONE      0x02 0x02 0x1F
+UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNOOP_MISS    0x02 0x02 0x21
+UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNOOP_HIT     0x02 0x02 0x22
+UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_HITM          0x02 0x02 0x24
+UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_NON_DRAM      0x02 0x02 0x25
+UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_AVG_LAT       0x02 0x02 0x26
+
+UMASK_OFFCORE_RESPONSE_1_WB_ANY           0x02 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_L2_HIT        0x02 0x03 0x12
+UMASK_OFFCORE_RESPONSE_1_WB_SNP_NONE      0x02 0x03 0x1F
+UMASK_OFFCORE_RESPONSE_1_WB_SNOOP_MISS    0x02 0x03 0x21
+UMASK_OFFCORE_RESPONSE_1_WB_SNOOP_HIT     0x02 0x03 0x22
+UMASK_OFFCORE_RESPONSE_1_WB_HITM          0x02 0x03 0x24
+UMASK_OFFCORE_RESPONSE_1_WB_NON_DRAM      0x02 0x03 0x25
+UMASK_OFFCORE_RESPONSE_1_WB_AVG_LAT       0x02 0x03 0x26
+
+UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_ANY           0x02 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_L2_HIT        0x02 0x04 0x12
+UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNP_NONE      0x02 0x04 0x1F
+UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNOOP_MISS    0x02 0x04 0x21
+UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNOOP_HIT     0x02 0x04 0x22
+UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_HITM          0x02 0x04 0x24
+UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_NON_DRAM      0x02 0x04 0x25
+UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_AVG_LAT       0x02 0x04 0x26
+
+UMASK_OFFCORE_RESPONSE_1_PF_RFO_ANY           0x02 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_RFO_L2_HIT        0x02 0x05 0x12
+UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNP_NONE      0x02 0x05 0x1F
+UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNOOP_MISS    0x02 0x05 0x21
+UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNOOP_HIT     0x02 0x05 0x22
+UMASK_OFFCORE_RESPONSE_1_PF_RFO_HITM          0x02 0x05 0x24
+UMASK_OFFCORE_RESPONSE_1_PF_RFO_NON_DRAM      0x02 0x05 0x25
+UMASK_OFFCORE_RESPONSE_1_PF_RFO_AVG_LAT       0x02 0x05 0x26
+
+UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_ANY           0x02 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_L2_HIT        0x02 0x06 0x12
+UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNP_NONE      0x02 0x06 0x1F
+UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNOOP_MISS    0x02 0x06 0x21
+UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNOOP_HIT     0x02 0x06 0x22
+UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_HITM          0x02 0x06 0x24
+UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_NON_DRAM      0x02 0x06 0x25
+UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_AVG_LAT       0x02 0x06 0x26
+
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_ANY           0x02 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_L2_HIT        0x02 0x07 0x12
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNP_NONE      0x02 0x07 0x1F
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNOOP_MISS    0x02 0x07 0x21
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNOOP_HIT     0x02 0x07 0x22
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_HITM          0x02 0x07 0x24
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_NON_DRAM      0x02 0x07 0x25
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_AVG_LAT       0x02 0x07 0x26
+
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_ANY           0x02 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_L2_HIT        0x02 0x08 0x12
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNP_NONE      0x02 0x08 0x1F
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNOOP_MISS    0x02 0x08 0x21
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNOOP_HIT     0x02 0x08 0x22
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_HITM          0x02 0x08 0x24
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_NON_DRAM      0x02 0x08 0x25
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_AVG_LAT       0x02 0x08 0x26
+
+UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_ANY           0x02 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_L2_HIT        0x02 0x09 0x12
+UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNP_NONE      0x02 0x09 0x1F
+UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNOOP_MISS    0x02 0x09 0x21
+UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNOOP_HIT     0x02 0x09 0x22
+UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_HITM          0x02 0x09 0x24
+UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_NON_DRAM      0x02 0x09 0x25
+UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_AVG_LAT       0x02 0x09 0x26
+
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_ANY           0x02 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_L2_HIT        0x02 0x0A 0x12
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNP_NONE      0x02 0x0A 0x1F
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNOOP_MISS    0x02 0x0A 0x21
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNOOP_HIT     0x02 0x0A 0x22
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_HITM          0x02 0x0A 0x24
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_NON_DRAM      0x02 0x0A 0x25
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_AVG_LAT       0x02 0x0A 0x26
+
+UMASK_OFFCORE_RESPONSE_1_STRM_ST_ANY           0x02 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_STRM_ST_L2_HIT        0x02 0x0B 0x12
+UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNP_NONE      0x02 0x0B 0x1F
+UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNOOP_MISS    0x02 0x0B 0x21
+UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNOOP_HIT     0x02 0x0B 0x22
+UMASK_OFFCORE_RESPONSE_1_STRM_ST_HITM          0x02 0x0B 0x24
+UMASK_OFFCORE_RESPONSE_1_STRM_ST_NON_DRAM      0x02 0x0B 0x25
+UMASK_OFFCORE_RESPONSE_1_STRM_ST_AVG_LAT       0x02 0x0B 0x26
+
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_ANY           0x02 0x0C 0x10
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_L2_HIT        0x02 0x0C 0x12
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNP_NONE      0x02 0x0C 0x1F
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNOOP_MISS    0x02 0x0C 0x21
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNOOP_HIT     0x02 0x0C 0x22
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_HITM          0x02 0x0C 0x24
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_NON_DRAM      0x02 0x0C 0x25
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_AVG_LAT       0x02 0x0C 0x26
+
+UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_ANY           0x02 0x0D 0x10
+UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_L2_HIT        0x02 0x0D 0x12
+UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNP_NONE      0x02 0x0D 0x1F
+UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNOOP_MISS    0x02 0x0D 0x21
+UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNOOP_HIT     0x02 0x0D 0x22
+UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_HITM          0x02 0x0D 0x24
+UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_NON_DRAM      0x02 0x0D 0x25
+UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_AVG_LAT       0x02 0x0D 0x26
+
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_ANY           0x02 0x0E 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_L2_HIT        0x02 0x0E 0x12
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNP_NONE      0x02 0x0E 0x1F
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNOOP_MISS    0x02 0x0E 0x21
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNOOP_HIT     0x02 0x0E 0x22
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_HITM          0x02 0x0E 0x24
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_NON_DRAM      0x02 0x0E 0x25
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_AVG_LAT       0x02 0x0E 0x26
+
+UMASK_OFFCORE_RESPONSE_1_ANY_ANY           0x02 0x0F 0x10
+UMASK_OFFCORE_RESPONSE_1_ANY_L2_HIT        0x02 0x0F 0x12
+UMASK_OFFCORE_RESPONSE_1_ANY_SNP_NONE      0x02 0x0F 0x1F
+UMASK_OFFCORE_RESPONSE_1_ANY_SNOOP_MISS    0x02 0x0F 0x21
+UMASK_OFFCORE_RESPONSE_1_ANY_SNOOP_HIT     0x02 0x0F 0x22
+UMASK_OFFCORE_RESPONSE_1_ANY_HITM          0x02 0x0F 0x24
+UMASK_OFFCORE_RESPONSE_1_ANY_NON_DRAM      0x02 0x0F 0x25
+UMASK_OFFCORE_RESPONSE_1_ANY_AVG_LAT       0x02 0x0F 0x26
+
+
+EVENT_INST_RETIRED                  0xC0 PMC
+UMASK_INST_RETIRED_ANY_P            0x00
+
+EVENT_UOPS_RETIRED                  0xC2 PMC
+UMASK_UOPS_RETIRED_MS               0x01
+UMASK_UOPS_RETIRED_ALL              0x10
+
+EVENT_MACHINE_CLEARS                0xC3 PMC
+UMASK_MACHINE_CLEARS_SMC            0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
+UMASK_MACHINE_CLEARS_FP_ASSIST      0x04
+UMASK_MACHINE_CLEARS_ALL            0x08
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_JCC           0x7E
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0xBF
+UMASK_BR_INST_RETIRED_NON_RETURN_IND 0xEB
+UMASK_BR_INST_RETIRED_RETURN        0xF7
+UMASK_BR_INST_RETIRED_CALL          0xF9
+UMASK_BR_INST_RETIRED_IND_CALL      0xFB
+UMASK_BR_INST_RETIRED_REL_CALL      0xFD
+UMASK_BR_INST_RETIRED_TAKEN_JCC     0xFE
+
+EVENT_BR_MISP_RETIRED               0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_MISP_RETIRED_JCC           0x7E
+UMASK_BR_MISP_RETIRED_FAR_BRANCH    0xBF
+UMASK_BR_MISP_RETIRED_NON_RETURN_IND 0xEB
+UMASK_BR_MISP_RETIRED_RETURN        0xF7
+UMASK_BR_MISP_RETIRED_CALL          0xF9
+UMASK_BR_MISP_RETIRED_IND_CALL      0xFB
+UMASK_BR_MISP_RETIRED_REL_CALL      0xFD
+UMASK_BR_MISP_RETIRED_TAKEN_JCC     0xFE
+
+EVENT_NO_ALLOC_CYCLES               0xCA PMC
+UMASK_NO_ALLOC_CYCLES_ROB_FULL      0x01
+UMASK_NO_ALLOC_CYCLES_RAT_STALL     0x20
+UMASK_NO_ALLOC_CYCLES_ALL           0x3F
+UMASK_NO_ALLOC_CYCLES_NOT_DELIVERED 0x50
+
+EVENT_RS_FULL_STALL                 0xCB PMC
+UMASK_RS_FULL_STALL_MEC             0x01
+UMASK_RS_FULL_STALL_ALL             0x1F
+
+EVENT_CYCLES_DIV_BUSY               0xCD PMC
+UMASK_CYCLES_DIV_BUSY_ANY           0x01
+
+EVENT_BACLEARS                      0xE6 PMC
+UMASK_BACLEARS_ALL                  0x01
+UMASK_BACLEARS_RETURN               0x08
+UMASK_BACLEARS_COND                 0x10
+
+EVENT_MS_DECODED                    0xE7 PMC
+UMASK_MS_DECODED_MS_ENTRY           0x01
+
diff --git a/src/includes/perfmon_types.h b/src/includes/perfmon_types.h
index 1b47e95..1f0663a 100644
--- a/src/includes/perfmon_types.h
+++ b/src/includes/perfmon_types.h
@@ -7,8 +7,8 @@
  *                    Configures and reads out performance counters
  *                    on x86 based architectures. Supports multi threading.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -54,6 +54,10 @@ typedef enum {
     PMC67, PMC68, PMC69, PMC70, PMC71, PMC72,
     PMC73, PMC74, PMC75, PMC76, PMC77, PMC78,
     PMC79, PMC80, PMC81, PMC82, PMC83, PMC84,
+    PMC85, PMC86, PMC87, PMC88, PMC89, PMC90,
+    PMC91, PMC92, PMC93, PMC94, PMC95, PMC96,
+    PMC97, PMC98, PMC99, PMC100, PMC101, PMC102,
+    PMC103, PMC104, PMC105, PMC106, PMC107, PMC108,
     NUM_PMC} PerfmonCounterIndex;
 
 typedef enum {
@@ -86,26 +90,32 @@ typedef enum {
     CBOX9,
     CBOX10,
     CBOX11,
+    CBOX12,
+    CBOX13,
+    CBOX14,
     PBOX,
     POWER,
+    UBOX,
     NUM_UNITS} PerfmonType;
 
 typedef struct {
     char* key;
     PerfmonCounterIndex index;
-    PerfmonType  type;
-    uint64_t  configRegister;
-    uint64_t  counterRegister;
-    uint64_t  counterRegister2;
+    PerfmonType type;
+    uint64_t configRegister;
+    uint64_t counterRegister;
+    uint64_t counterRegister2;
     PciDeviceIndex device;
 } PerfmonCounterMap;
 
 typedef struct {
-    char* key;
+    const char* key;
     PerfmonGroup index;
     int isUncore;
-    char* info;
-    char* config;
+    const char* info;
+    const char* config;
+    int derivedCounters;
+    const char ** derivedCounterNames;
 } PerfmonGroupMap;
 
 typedef struct {
@@ -115,15 +125,15 @@ typedef struct {
 
 /* only used in westmereEX at the moment */
 typedef struct {
-    uint32_t  ctrlRegister;
-    uint32_t  statusRegister;
-    uint32_t  ovflRegister;
+    uint32_t ctrlRegister;
+    uint32_t statusRegister;
+    uint32_t ovflRegister;
 } PerfmonUnit;
 
 typedef struct {
-    int       init;
-    int       id;  /* TODO id is only used for EX type processors */
-    uint64_t  counterData;
+    int init;
+    int id;  /* TODO id is only used for EX type processors */
+    double counterData;
 } PerfmonCounter;
 
 typedef struct {
@@ -132,8 +142,8 @@ typedef struct {
 } PerfmonThread;
 
 typedef struct {
-    const char*    name;
-    const char*    limit;
+    const char* name;
+    const char* limit;
     uint16_t eventId;
     uint8_t umask;
     uint8_t cfgBits;
diff --git a/src/includes/perfmon_westmere.h b/src/includes/perfmon_westmere.h
index 80c9b91..c469766 100644
--- a/src/includes/perfmon_westmere.h
+++ b/src/includes/perfmon_westmere.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of perfmon module for Westmere.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_westmereEX.h b/src/includes/perfmon_westmereEX.h
index c58a1fd..8cbc921 100644
--- a/src/includes/perfmon_westmereEX.h
+++ b/src/includes/perfmon_westmereEX.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header File of perfmon module for Westmere EX.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -66,16 +66,16 @@ void perfmon_init_westmereEX(PerfmonThread *thread)
      * FIXED 0: Instructions retired
      * FIXED 1: Clocks unhalted core
      * FIXED 2: Clocks unhalted ref */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
+    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
 
     /* Preinit of PERFEVSEL registers */
-    flags |= (1<<22);  /* enable flag */
-    flags |= (1<<16);  /* user mode flag */
+    //flags |= (1<<22);  /* enable flag */
+    //flags |= (1<<16);  /* user mode flag */
 
-    msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
+    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
     msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
     msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);
+    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
 
     /* Initialize uncore */
     /* MBOX */
@@ -151,6 +151,64 @@ void perfmon_init_westmereEX(PerfmonThread *thread)
     westmereEX_PMunits[WBOX].statusRegister = MSR_W_PMON_BOX_STATUS;
     westmereEX_PMunits[WBOX].ovflRegister   = MSR_W_PMON_BOX_OVF_CTRL;
 
+    thread->counters[PMC48].id = 0;
+    westmereEX_PMunits[UBOX].ctrlRegister   = MSR_U_PMON_GLOBAL_CTRL;
+    westmereEX_PMunits[UBOX].statusRegister = MSR_U_PMON_GLOBAL_STATUS;
+    westmereEX_PMunits[UBOX].ovflRegister   = MSR_U_PMON_GLOBAL_OVF_CTRL;
+
+    /* Set IDs for all CBOXes */
+    int walker = 0;
+    for (int i=PMC49; i<=PMC98; i++)
+    {
+        thread->counters[i].id = walker;
+        walker = (walker == 4 ? 0 : walker + 1);
+    }
+    westmereEX_PMunits[CBOX0].ctrlRegister   = MSR_C0_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX0].statusRegister = MSR_C0_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX0].ovflRegister   = MSR_C0_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX1].ctrlRegister   = MSR_C1_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX1].statusRegister = MSR_C1_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX1].ovflRegister   = MSR_C1_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX2].ctrlRegister   = MSR_C2_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX2].statusRegister = MSR_C2_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX2].ovflRegister   = MSR_C2_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX3].ctrlRegister   = MSR_C3_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX3].statusRegister = MSR_C3_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX3].ovflRegister   = MSR_C3_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX4].ctrlRegister   = MSR_C4_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX4].statusRegister = MSR_C4_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX4].ovflRegister   = MSR_C4_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX5].ctrlRegister   = MSR_C5_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX5].statusRegister = MSR_C5_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX5].ovflRegister   = MSR_C5_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX6].ctrlRegister   = MSR_C6_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX6].statusRegister = MSR_C6_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX6].ovflRegister   = MSR_C6_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX7].ctrlRegister   = MSR_C7_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX7].statusRegister = MSR_C7_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX7].ovflRegister   = MSR_C7_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX8].ctrlRegister   = MSR_C8_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX8].statusRegister = MSR_C8_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX8].ovflRegister   = MSR_C8_PMON_BOX_OVF_CTRL;
+    westmereEX_PMunits[CBOX9].ctrlRegister   = MSR_C9_PMON_BOX_CTRL;
+    westmereEX_PMunits[CBOX9].statusRegister = MSR_C9_PMON_BOX_STATUS;
+    westmereEX_PMunits[CBOX9].ovflRegister   = MSR_C9_PMON_BOX_OVF_CTRL;
+
+    thread->counters[PMC99].id = 0;
+    thread->counters[PMC100].id = 1;
+    thread->counters[PMC101].id = 2;
+    thread->counters[PMC102].id = 3;
+    westmereEX_PMunits[SBOX0].ctrlRegister   = MSR_S0_PMON_BOX_CTRL;
+    westmereEX_PMunits[SBOX0].statusRegister = MSR_S0_PMON_BOX_STATUS;
+    westmereEX_PMunits[SBOX0].ovflRegister   = MSR_S0_PMON_BOX_OVF_CTRL;
+    thread->counters[PMC103].id = 0;
+    thread->counters[PMC104].id = 1;
+    thread->counters[PMC105].id = 2;
+    thread->counters[PMC106].id = 3;
+    westmereEX_PMunits[SBOX1].ctrlRegister   = MSR_S1_PMON_BOX_CTRL;
+    westmereEX_PMunits[SBOX1].statusRegister = MSR_S1_PMON_BOX_STATUS;
+    westmereEX_PMunits[SBOX1].ovflRegister   = MSR_S1_PMON_BOX_OVF_CTRL;
+
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
             lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
     {
@@ -209,6 +267,78 @@ void perfmon_init_westmereEX(PerfmonThread *thread)
         msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL14, 0x0ULL);
         msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL15, 0x0ULL);
 
+        msr_write(cpu_id, MSR_U_PMON_GLOBAL_EVNT_SEL, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL3, 0x0ULL);
+        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL4, 0x0ULL);
+
+        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL3, 0x0ULL);
+
+        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL0, 0x0ULL);
+        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL1, 0x0ULL);
+        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL2, 0x0ULL);
+        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL3, 0x0ULL);
+
         {
             uint32_t ubflags = 0x0UL;
             ubflags |= (1<<29); /* reset all */
@@ -217,205 +347,203 @@ void perfmon_init_westmereEX(PerfmonThread *thread)
     }
 }
 
-/* MBOX macros */
-
 #define MBOX_GATE(NUM)  \
     flags = 0x41ULL; \
 switch (event->cfgBits)  \
 {  \
     case 0x00:   /* primary Event */  \
-                                      flags |= (event->eventId<<9);  \
-    break;  \
+        flags |= (event->eventId<<9);  \
+        break;  \
     case 0x01: /* secondary Events */  \
-                                       /* TODO fvid index is missing defaults to 0 */   \
-    flags |= (1<<7); /* toggle flag mode */   \
-    flags |= (event->eventId<<19);   \
-    switch (event->eventId)   \
-    {   \
-        case 0x00: /* CYCLES_DSP_FILL: DSP */   \
-                                                {   \
-                                                    uint64_t dsp_flags = 0x0ULL;   \
-                                                    dsp_flags |= (event->umask<<7);  \
-                                                    msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
-                                                }   \
-        break;   \
-        case 0x01: /* CYCLES_SCHED_MODE: ISS */   \
-                                                  {   \
-                                                      uint32_t iss_flags = 0x0UL;   \
-                                                      iss_flags |= (event->umask<<4);   \
-                                                      msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-                                                  }    \
-        break;   \
-        case 0x05: /* CYCLES_PGT_STATE: PGT */   \
-                                                 {   \
-                                                     uint32_t pgt_flags = 0x0UL;   \
-                                                     pgt_flags |= (event->umask<<6);   \
-                                                     msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-                                                 }    \
-        break;   \
-        case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */   \
-                                                      {   \
-                                                          uint32_t map_flags = 0x0UL;   \
-                                                          map_flags |= (event->umask<<6);   \
-                                                          msr_write(cpu_id, MSR_M##NUM##_PMON_MAP, map_flags);   \
-                                                      }   \
+        /* TODO fvid index is missing defaults to 0 */   \
+        flags |= (1<<7); /* toggle flag mode */   \
+        flags |= (event->eventId<<19);   \
+        switch (event->eventId)   \
+        {   \
+            case 0x00: /* CYCLES_DSP_FILL: DSP */   \
+                {   \
+                    uint64_t dsp_flags = 0x0ULL;   \
+                    dsp_flags |= (event->umask<<7);  \
+                    msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
+                }   \
+                break;   \
+            case 0x01: /* CYCLES_SCHED_MODE: ISS */   \
+                {   \
+                    uint32_t iss_flags = 0x0UL;   \
+                    iss_flags |= (event->umask<<4);   \
+                    msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
+                }    \
+                break;   \
+            case 0x05: /* CYCLES_PGT_STATE: PGT */   \
+                {   \
+                    uint32_t pgt_flags = 0x0UL;   \
+                    pgt_flags |= (event->umask<<6);   \
+                    msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
+                }    \
+                break;   \
+            case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */   \
+                {   \
+                    uint32_t map_flags = 0x0UL;   \
+                    map_flags |= (event->umask<<6);   \
+                    msr_write(cpu_id, MSR_M##NUM##_PMON_MAP, map_flags);   \
+                }   \
+                break;   \
+        }    \
         break;   \
-    }    \
-    break;   \
     case 0x02: /* DRAM_CMD: PLD/ISS */   \
-                                         flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t pld_flags = 0x0UL;   \
-        uint32_t iss_flags = 0x0UL;   \
-        pld_flags |= (event->umask<<8);   \
-        if (event->cmask != 0)   \
+        flags |= (event->eventId<<9);   \
         {   \
-            iss_flags |= (event->cmask<<7);   \
-            pld_flags |= 1; /* toggle cmd flag */   \
+            uint32_t pld_flags = 0x0UL;   \
+            uint32_t iss_flags = 0x0UL;   \
+            pld_flags |= (event->umask<<8);   \
+            if (event->cmask != 0)   \
+            {   \
+                iss_flags |= (event->cmask<<7);   \
+                pld_flags |= 1; /* toggle cmd flag */   \
+            }   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
         }   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-    }   \
-    break;   \
+        break;   \
     case 0x03: /* DSP_FILL: DSP */   \
-                                     flags |= (event->eventId<<9);   \
-    {   \
-        uint64_t dsp_flags = 0x0ULL;   \
-        dsp_flags |= (event->umask<<7);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
-    }   \
-    break;   \
+        flags |= (event->eventId<<9);   \
+        {   \
+            uint64_t dsp_flags = 0x0ULL;   \
+            dsp_flags |= (event->umask<<7);   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
+        }   \
+        break;   \
     case 0x04: /* DRAM_MISC: PLD */   \
-                                      flags |= (event->eventId<<9);   \
-    {   \
-        uint64_t pld_flags = 0x0ULL;   \
-        switch (event->cmask)   \
+        flags |= (event->eventId<<9);   \
         {   \
-            case 0x0:   \
-                        pld_flags |= (1<<16);   \
-            pld_flags |= (event->umask<<19);   \
-            break;   \
-            case 0x1:   \
-                        pld_flags |= (event->umask<<18);   \
-            break;   \
-            case 0x2:   \
-                        pld_flags |= (event->umask<<17);   \
-            break;   \
-            case 0x3:   \
-                        pld_flags |= (event->umask<<7);   \
-            break;   \
+            uint64_t pld_flags = 0x0ULL;   \
+            switch (event->cmask)   \
+            {   \
+                case 0x0:   \
+                            pld_flags |= (1<<16);   \
+                pld_flags |= (event->umask<<19);   \
+                break;   \
+                case 0x1:   \
+                            pld_flags |= (event->umask<<18);   \
+                break;   \
+                case 0x2:   \
+                            pld_flags |= (event->umask<<17);   \
+                break;   \
+                case 0x3:   \
+                            pld_flags |= (event->umask<<7);   \
+                break;   \
+            }   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
         }   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
-    }   \
-    break;   \
+        break;   \
     case 0x05: /* FRM_TYPE: ISS */   \
-                                     flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t iss_flags = 0x0UL;   \
-        iss_flags |= event->umask;   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-    }   \
-    break;   \
-    case 0x06: /* FVC_EV0: FVC */   \
-                                    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t fvc_flags = 0x0UL;   \
-        fvc_flags |= (event->umask<<12);   \
-        if (event->umask == 0x5)   \
+        flags |= (event->eventId<<9);   \
         {   \
-            fvc_flags |= (event->cmask<<6);   \
+            uint32_t iss_flags = 0x0UL;   \
+            iss_flags |= event->umask;   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
         }   \
-        else   \
+        break;   \
+    case 0x06: /* FVC_EV0: FVC */   \
+        flags |= (event->eventId<<9);   \
         {   \
-            fvc_flags |= (event->cmask<<9);   \
+            uint32_t fvc_flags = 0x0UL;   \
+            fvc_flags |= (event->umask<<12);   \
+            if (event->umask == 0x5)   \
+            {   \
+                fvc_flags |= (event->cmask<<6);   \
+            }   \
+            else   \
+            {   \
+                fvc_flags |= (event->cmask<<9);   \
+            }   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
+            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV0) \
         }   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-        VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV0) \
-    }   \
-    break;   \
+        break;   \
     case 0x07: /* FVC_EV1: FVC */   \
-                                    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t fvc_flags = 0x0UL;   \
-        fvc_flags |= (event->umask<<15);   \
-        if (event->umask == 0x5)   \
+        flags |= (event->eventId<<9);   \
         {   \
-            fvc_flags |= (event->cmask<<6);   \
+            uint32_t fvc_flags = 0x0UL;   \
+            fvc_flags |= (event->umask<<15);   \
+            if (event->umask == 0x5)   \
+            {   \
+                fvc_flags |= (event->cmask<<6);   \
+            }   \
+            else   \
+            {   \
+                fvc_flags |= (event->cmask<<9);   \
+            }   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
+            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV1) \
         }   \
-        else   \
-        {   \
-            fvc_flags |= (event->cmask<<9);   \
-        }   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-        VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV1) \
-    }   \
-    break;   \
+        break;   \
     case 0x08: /* FVC_EV2: FVC */   \
-                                    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t fvc_flags = 0x0UL;   \
-        fvc_flags |= (event->umask<<18);   \
-        if (event->umask == 0x5)   \
-        {   \
-            fvc_flags |= (event->cmask<<6);   \
-        }   \
-        else   \
+        flags |= (event->eventId<<9);   \
         {   \
-            fvc_flags |= (event->cmask<<9);   \
+            uint32_t fvc_flags = 0x0UL;   \
+            fvc_flags |= (event->umask<<18);   \
+            if (event->umask == 0x5)   \
+            {   \
+                fvc_flags |= (event->cmask<<6);   \
+            }   \
+            else   \
+            {   \
+                fvc_flags |= (event->cmask<<9);   \
+            }   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
+            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV2) \
         }   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-        VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV2) \
-    }   \
-    break;   \
+        break;   \
     case 0x09: /* FVC_EV3: FVC(ZDP) */   \
-                                         flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t fvc_flags = 0x0UL;   \
-        fvc_flags |= (event->umask<<21);   \
-        if (event->umask == 0x5)   \
+        flags |= (event->eventId<<9);   \
         {   \
-            fvc_flags |= (event->cmask<<6);   \
+            uint32_t fvc_flags = 0x0UL;   \
+            fvc_flags |= (event->umask<<21);   \
+            if (event->umask == 0x5)   \
+            {   \
+                fvc_flags |= (event->cmask<<6);   \
+            }   \
+            else   \
+            {   \
+                fvc_flags |= (event->cmask<<9);   \
+            }   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
         }   \
-        else   \
+        break;   \
+    case 0x0A: /* ISS_SCHED: ISS */   \
+        flags |= (event->eventId<<9);   \
         {   \
-            fvc_flags |= (event->cmask<<9);   \
+            uint32_t iss_flags = 0x0UL;   \
+            iss_flags |= (event->umask<<10);   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
         }   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-    }   \
-    break;   \
-    case 0x0A: /* ISS_SCHED: ISS */   \
-                                      flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t iss_flags = 0x0UL;   \
-        iss_flags |= (event->umask<<10);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-    }   \
-    break;   \
+        break;   \
     case 0x0B: /* PGT_PAGE_EV: PGT */   \
-                                        flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t pgt_flags = 0x0UL;   \
-        pgt_flags |= event->umask;   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-    }   \
-    break;   \
+        flags |= (event->eventId<<9);   \
+        {   \
+            uint32_t pgt_flags = 0x0UL;   \
+            pgt_flags |= event->umask;   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
+        }   \
+        break;   \
     case 0x0C: /* PGT_PAGE_EV2: PGT */   \
-                                         flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t pgt_flags = 0x0UL;   \
-        pgt_flags |= (event->umask<<11);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-    }   \
-    break;   \
+        flags |= (event->eventId<<9);   \
+        {   \
+            uint32_t pgt_flags = 0x0UL;   \
+            pgt_flags |= (event->umask<<11);   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
+        }   \
+        break;   \
     case 0x0D: /* THERM_TRP_DN: THR */   \
-                                         flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t thr_flags = 0x0UL;   \
-        thr_flags |= (1<<3);   \
-        thr_flags |= (event->umask<<9);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, thr_flags);   \
-    }   \
-    break;   \
+        flags |= (event->eventId<<9);   \
+        {   \
+            uint32_t thr_flags = 0x0UL;   \
+            thr_flags |= (1<<3);   \
+            thr_flags |= (event->umask<<9);   \
+            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, thr_flags);   \
+        }   \
+        break;   \
 }
 
 /* RBOX macros */
@@ -497,9 +625,10 @@ void perfmon_setupCounterThread_westmereEX(
         PerfmonCounterIndex index)
 {
     int haveLock = 0;
-    uint64_t flags = 0x0ULL;;
+    uint64_t flags = 0x0ULL;
     uint64_t reg = westmereEX_counter_map[index].configRegister;
     int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
     perfmon_threadData[thread_id].counters[index].init = TRUE;
 
     if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
@@ -510,8 +639,7 @@ void perfmon_setupCounterThread_westmereEX(
     switch (westmereEX_counter_map[index].type)
     {
         case PMC:
-            flags = msr_read(cpu_id,reg);
-            flags &= ~(0xFFFFU);   /* clear lower 16bits */
+            flags = (1<<22)|(1<<16);
 
             /* Intel with standard 8 bit event mask: [7:0] */
             flags |= (event->umask<<8) + event->eventId;
@@ -527,6 +655,8 @@ void perfmon_setupCounterThread_westmereEX(
                 break;
 
         case FIXED:
+            fixed_flags |= (0x2 <<(index*4));
+            msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
             break;
 
         case MBOX0:
@@ -594,6 +724,37 @@ void perfmon_setupCounterThread_westmereEX(
             }
             break;
 
+        case UBOX: /* U-box counter: reg gets only bit 22 and the event id (no umask) */
+            if (haveLock)
+            {
+                flags = 0x0ULL;
+                flags |= (1<<22);
+                flags |= (event->eventId);
+                msr_write(cpu_id, reg , flags);
+            }
+            break; /* UBOX is done: do not fall into the CBOX/SBOX arm, which would rewrite reg with umask bits */
+        case CBOX0:
+        case CBOX1:
+        case CBOX2:
+        case CBOX3:
+        case CBOX4:
+        case CBOX5:
+        case CBOX6:
+        case CBOX7:
+        case CBOX8:
+        case CBOX9:
+        case SBOX0:
+        case SBOX1:
+            if (haveLock)
+            {
+                flags = 0x0ULL;
+                flags |= (1<<22);
+                flags |= (event->umask<<8);
+                flags |= (event->eventId);
+                msr_write(cpu_id, reg , flags);
+            }
+            break;
+
         default:
             /* should never be reached */
             break;
@@ -621,6 +782,7 @@ void perfmon_startCountersThread_westmereEX(int thread_id)
     int haveLock = 0;
     uint64_t flags = 0x0ULL;
     uint32_t uflags[NUM_UNITS];
+    int enable_ubox = 0;
     int cpu_id = perfmon_threadData[thread_id].processorId;
 
     msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
@@ -659,6 +821,10 @@ void perfmon_startCountersThread_westmereEX(int thread_id)
                     msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
                     uflags[westmereEX_counter_map[i].type] |=
                         (1<<(perfmon_threadData[thread_id].counters[i].id));  /* enable uncore counter */
+                    if (westmereEX_counter_map[i].type == UBOX)
+                    {
+                        enable_ubox = 1;
+                    }
                 }
             }
         }
@@ -681,6 +847,10 @@ void perfmon_startCountersThread_westmereEX(int thread_id)
         /* set global enable flag in U BOX ctrl register */
         uint32_t ubflags = 0x0UL;
         ubflags |= (1<<28); /* enable all */
+        if (enable_ubox)
+        {
+            ubflags |= (1<<0);
+        }
         VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubflags, UBOX_GLOBAL_CTRL);
         msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
     }
diff --git a/src/includes/perfmon_westmereEX_counters.h b/src/includes/perfmon_westmereEX_counters.h
index 5d04b75..fd65746 100644
--- a/src/includes/perfmon_westmereEX_counters.h
+++ b/src/includes/perfmon_westmereEX_counters.h
@@ -5,8 +5,8 @@
  *
  *      Description: Counter Header File of perfmon module for Westmere EX.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -29,8 +29,8 @@
  */
 
 #define NUM_COUNTERS_CORE_WESTMEREEX 7
-#define NUM_COUNTERS_UNCORE_WESTMEREEX 48
-#define NUM_COUNTERS_WESTMEREEX 48
+#define NUM_COUNTERS_UNCORE_WESTMEREEX 107
+#define NUM_COUNTERS_WESTMEREEX 107
 
 static PerfmonCounterMap westmereEX_counter_map[NUM_COUNTERS_WESTMEREEX] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
@@ -86,6 +86,68 @@ static PerfmonCounterMap westmereEX_counter_map[NUM_COUNTERS_WESTMEREEX] = {
     {"WBOX1",PMC44, WBOX, MSR_W_PMON_EVNT_SEL1, MSR_W_PMON_CTR1, 0, 0},
     {"WBOX2",PMC45, WBOX, MSR_W_PMON_EVNT_SEL2, MSR_W_PMON_CTR2, 0, 0},
     {"WBOX3",PMC46, WBOX, MSR_W_PMON_EVNT_SEL3, MSR_W_PMON_CTR3, 0, 0},
-    {"WBOX4",PMC47, WBOX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0}
+    {"WBOX4",PMC47, WBOX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0},
+    /* UBOX */
+    {"UBOX0",PMC48, UBOX, MSR_U_PMON_GLOBAL_EVNT_SEL, MSR_U_PMON_GLOBAL_CTR, 0, 0},
+    /* CBOXes */
+    {"CBOX0C0",PMC49, CBOX0, MSR_C0_PMON_EVNT_SEL0, MSR_C0_PMON_CTR0, 0, 0},
+    {"CBOX0C1",PMC50, CBOX0, MSR_C0_PMON_EVNT_SEL1, MSR_C0_PMON_CTR1, 0, 0},
+    {"CBOX0C2",PMC51, CBOX0, MSR_C0_PMON_EVNT_SEL2, MSR_C0_PMON_CTR2, 0, 0},
+    {"CBOX0C3",PMC52, CBOX0, MSR_C0_PMON_EVNT_SEL3, MSR_C0_PMON_CTR3, 0, 0},
+    {"CBOX0C4",PMC53, CBOX0, MSR_C0_PMON_EVNT_SEL4, MSR_C0_PMON_CTR4, 0, 0},
+    {"CBOX1C0",PMC54, CBOX1, MSR_C1_PMON_EVNT_SEL0, MSR_C1_PMON_CTR0, 0, 0},
+    {"CBOX1C1",PMC55, CBOX1, MSR_C1_PMON_EVNT_SEL1, MSR_C1_PMON_CTR1, 0, 0},
+    {"CBOX1C2",PMC56, CBOX1, MSR_C1_PMON_EVNT_SEL2, MSR_C1_PMON_CTR2, 0, 0},
+    {"CBOX1C3",PMC57, CBOX1, MSR_C1_PMON_EVNT_SEL3, MSR_C1_PMON_CTR3, 0, 0},
+    {"CBOX1C4",PMC58, CBOX1, MSR_C1_PMON_EVNT_SEL4, MSR_C1_PMON_CTR4, 0, 0},
+    {"CBOX2C0",PMC59, CBOX2, MSR_C2_PMON_EVNT_SEL0, MSR_C2_PMON_CTR0, 0, 0},
+    {"CBOX2C1",PMC60, CBOX2, MSR_C2_PMON_EVNT_SEL1, MSR_C2_PMON_CTR1, 0, 0},
+    {"CBOX2C2",PMC61, CBOX2, MSR_C2_PMON_EVNT_SEL2, MSR_C2_PMON_CTR2, 0, 0},
+    {"CBOX2C3",PMC62, CBOX2, MSR_C2_PMON_EVNT_SEL3, MSR_C2_PMON_CTR3, 0, 0},
+    {"CBOX2C4",PMC63, CBOX2, MSR_C2_PMON_EVNT_SEL4, MSR_C2_PMON_CTR4, 0, 0},
+    {"CBOX3C0",PMC64, CBOX3, MSR_C3_PMON_EVNT_SEL0, MSR_C3_PMON_CTR0, 0, 0},
+    {"CBOX3C1",PMC65, CBOX3, MSR_C3_PMON_EVNT_SEL1, MSR_C3_PMON_CTR1, 0, 0},
+    {"CBOX3C2",PMC66, CBOX3, MSR_C3_PMON_EVNT_SEL2, MSR_C3_PMON_CTR2, 0, 0},
+    {"CBOX3C3",PMC67, CBOX3, MSR_C3_PMON_EVNT_SEL3, MSR_C3_PMON_CTR3, 0, 0},
+    {"CBOX3C4",PMC68, CBOX3, MSR_C3_PMON_EVNT_SEL4, MSR_C3_PMON_CTR4, 0, 0},
+    {"CBOX4C0",PMC69, CBOX4, MSR_C4_PMON_EVNT_SEL0, MSR_C4_PMON_CTR0, 0, 0},
+    {"CBOX4C1",PMC70, CBOX4, MSR_C4_PMON_EVNT_SEL1, MSR_C4_PMON_CTR1, 0, 0},
+    {"CBOX4C2",PMC71, CBOX4, MSR_C4_PMON_EVNT_SEL2, MSR_C4_PMON_CTR2, 0, 0},
+    {"CBOX4C3",PMC72, CBOX4, MSR_C4_PMON_EVNT_SEL3, MSR_C4_PMON_CTR3, 0, 0},
+    {"CBOX4C4",PMC73, CBOX4, MSR_C4_PMON_EVNT_SEL4, MSR_C4_PMON_CTR4, 0, 0},
+    {"CBOX5C0",PMC74, CBOX5, MSR_C5_PMON_EVNT_SEL0, MSR_C5_PMON_CTR0, 0, 0},
+    {"CBOX5C1",PMC75, CBOX5, MSR_C5_PMON_EVNT_SEL1, MSR_C5_PMON_CTR1, 0, 0},
+    {"CBOX5C2",PMC76, CBOX5, MSR_C5_PMON_EVNT_SEL2, MSR_C5_PMON_CTR2, 0, 0},
+    {"CBOX5C3",PMC77, CBOX5, MSR_C5_PMON_EVNT_SEL3, MSR_C5_PMON_CTR3, 0, 0},
+    {"CBOX5C4",PMC78, CBOX5, MSR_C5_PMON_EVNT_SEL4, MSR_C5_PMON_CTR4, 0, 0},
+    {"CBOX6C0",PMC79, CBOX6, MSR_C6_PMON_EVNT_SEL0, MSR_C6_PMON_CTR0, 0, 0},
+    {"CBOX6C1",PMC80, CBOX6, MSR_C6_PMON_EVNT_SEL1, MSR_C6_PMON_CTR1, 0, 0},
+    {"CBOX6C2",PMC81, CBOX6, MSR_C6_PMON_EVNT_SEL2, MSR_C6_PMON_CTR2, 0, 0},
+    {"CBOX6C3",PMC82, CBOX6, MSR_C6_PMON_EVNT_SEL3, MSR_C6_PMON_CTR3, 0, 0},
+    {"CBOX6C4",PMC83, CBOX6, MSR_C6_PMON_EVNT_SEL4, MSR_C6_PMON_CTR4, 0, 0},
+    {"CBOX7C0",PMC84, CBOX7, MSR_C7_PMON_EVNT_SEL0, MSR_C7_PMON_CTR0, 0, 0},
+    {"CBOX7C1",PMC85, CBOX7, MSR_C7_PMON_EVNT_SEL1, MSR_C7_PMON_CTR1, 0, 0},
+    {"CBOX7C2",PMC86, CBOX7, MSR_C7_PMON_EVNT_SEL2, MSR_C7_PMON_CTR2, 0, 0},
+    {"CBOX7C3",PMC87, CBOX7, MSR_C7_PMON_EVNT_SEL3, MSR_C7_PMON_CTR3, 0, 0},
+    {"CBOX7C4",PMC88, CBOX7, MSR_C7_PMON_EVNT_SEL4, MSR_C7_PMON_CTR4, 0, 0},
+    {"CBOX8C0",PMC89, CBOX8, MSR_C8_PMON_EVNT_SEL0, MSR_C8_PMON_CTR0, 0, 0},
+    {"CBOX8C1",PMC90, CBOX8, MSR_C8_PMON_EVNT_SEL1, MSR_C8_PMON_CTR1, 0, 0},
+    {"CBOX8C2",PMC91, CBOX8, MSR_C8_PMON_EVNT_SEL2, MSR_C8_PMON_CTR2, 0, 0},
+    {"CBOX8C3",PMC92, CBOX8, MSR_C8_PMON_EVNT_SEL3, MSR_C8_PMON_CTR3, 0, 0},
+    {"CBOX8C4",PMC93, CBOX8, MSR_C8_PMON_EVNT_SEL4, MSR_C8_PMON_CTR4, 0, 0},
+    {"CBOX9C0",PMC94, CBOX9, MSR_C9_PMON_EVNT_SEL0, MSR_C9_PMON_CTR0, 0, 0},
+    {"CBOX9C1",PMC95, CBOX9, MSR_C9_PMON_EVNT_SEL1, MSR_C9_PMON_CTR1, 0, 0},
+    {"CBOX9C2",PMC96, CBOX9, MSR_C9_PMON_EVNT_SEL2, MSR_C9_PMON_CTR2, 0, 0},
+    {"CBOX9C3",PMC97, CBOX9, MSR_C9_PMON_EVNT_SEL3, MSR_C9_PMON_CTR3, 0, 0},
+    {"CBOX9C4",PMC98, CBOX9, MSR_C9_PMON_EVNT_SEL4, MSR_C9_PMON_CTR4, 0, 0},
+    /* SBOXes */
+    {"SBOX0C0",PMC99 , SBOX0, MSR_S0_PMON_EVNT_SEL0, MSR_S0_PMON_CTR0, 0, 0},
+    {"SBOX0C1",PMC100, SBOX0, MSR_S0_PMON_EVNT_SEL1, MSR_S0_PMON_CTR1, 0, 0},
+    {"SBOX0C2",PMC101, SBOX0, MSR_S0_PMON_EVNT_SEL2, MSR_S0_PMON_CTR2, 0, 0},
+    {"SBOX0C3",PMC102, SBOX0, MSR_S0_PMON_EVNT_SEL3, MSR_S0_PMON_CTR3, 0, 0},
+    {"SBOX1C0",PMC103, SBOX1, MSR_S1_PMON_EVNT_SEL0, MSR_S1_PMON_CTR0, 0, 0},
+    {"SBOX1C1",PMC104, SBOX1, MSR_S1_PMON_EVNT_SEL1, MSR_S1_PMON_CTR1, 0, 0},
+    {"SBOX1C2",PMC105, SBOX1, MSR_S1_PMON_EVNT_SEL2, MSR_S1_PMON_CTR2, 0, 0},
+    {"SBOX1C3",PMC106, SBOX1, MSR_S1_PMON_EVNT_SEL3, MSR_S1_PMON_CTR3, 0, 0}
 };
 
diff --git a/src/includes/perfmon_westmereEX_events.txt b/src/includes/perfmon_westmereEX_events.txt
index aa17ce2..2aabf8d 100644
--- a/src/includes/perfmon_westmereEX_events.txt
+++ b/src/includes/perfmon_westmereEX_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel WestmereEX
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
@@ -2751,6 +2751,639 @@ UMASK_CORRECTED_ERR                             0x00
 EVENT_UNCORRECTED_ERR                           0x1E5  UBOX
 UMASK_UNCORRECTED_ERR                           0x00
 
+EVENT_LLC_MISSES                                 0x14 CBOX
+UMASK_LLC_MISSES_SHARED                          0x01
+UMASK_LLC_MISSES_FORWARD                         0x02
+UMASK_LLC_MISSES_INVALID                         0x04
+UMASK_LLC_MISSES_ALL                             0x07
+
+EVENT_LLC_HITS                                   0x15 CBOX
+UMASK_LLC_HITS_MODIFIED                          0x01
+UMASK_LLC_HITS_EXCLUSIVE                         0x02
+UMASK_LLC_HITS_SHARED                            0x04
+UMASK_LLC_HITS_FORWARD                           0x08
+UMASK_LLC_HITS_ALL                               0x0F
+
+EVENT_LLC_S_FILLS                               0x16 CBOX
+UMASK_LLC_S_FILLS_MODIFIED                      0x01
+UMASK_LLC_S_FILLS_EXCLUSIVE                     0x02
+UMASK_LLC_S_FILLS_SHARED                        0x04
+UMASK_LLC_S_FILLS_FORWARD                       0x08
+UMASK_LLC_S_FILLS_ALL                           0x0F
+
+EVENT_LLC_VICTIMS                               0x17 CBOX
+UMASK_LLC_VICTIMS_MODIFIED                      0x01
+UMASK_LLC_VICTIMS_EXCLUSIVE                     0x02
+UMASK_LLC_VICTIMS_SHARED                        0x04
+UMASK_LLC_VICTIMS_FORWARD                       0x08
+UMASK_LLC_VICTIMS_ALL                           0x0F
+UMASK_LLC_VICTIMS_FILL_WITHOUT_VICTIMS          0x10
+
+EVENT_ARB_LOSSES                                 0x0A CBOX
+UMASK_ARB_LOSSES_AD_SB                           0x01
+UMASK_ARB_LOSSES_AD_NSB                          0x02
+UMASK_ARB_LOSSES_AD_ALL                          0x03
+UMASK_ARB_LOSSES_AK_SB                           0x04
+UMASK_ARB_LOSSES_AK_NSB                          0x08
+UMASK_ARB_LOSSES_AK_ALL                          0x0C
+UMASK_ARB_LOSSES_BL_SB                           0x10
+UMASK_ARB_LOSSES_BL_NSB                          0x20
+UMASK_ARB_LOSSES_BL_ALL                          0x30
+UMASK_ARB_LOSSES_IV                              0x40
+UMASK_ARB_LOSSES_ALL                             0x7F
+
+EVENT_ARB_WINS                                 0x09 CBOX
+UMASK_ARB_WINS_AD_SB                           0x01
+UMASK_ARB_WINS_AD_NSB                          0x02
+UMASK_ARB_WINS_AD_ALL                          0x03
+UMASK_ARB_WINS_AK_SB                           0x04
+UMASK_ARB_WINS_AK_NSB                          0x08
+UMASK_ARB_WINS_AK_ALL                          0x0C
+UMASK_ARB_WINS_BL_SB                           0x10
+UMASK_ARB_WINS_BL_NSB                          0x20
+UMASK_ARB_WINS_BL_ALL                          0x30
+UMASK_ARB_WINS_IV                              0x40
+UMASK_ARB_WINS_ALL                             0x7F
+
+EVENT_ARB_WINS_P2C_NSB                         0x34 CBOX
+UMASK_ARB_WINS_P2C_NSB                         0x00
+
+EVENT_ARB_WINS_P2C_SB                         0x33 CBOX
+UMASK_ARB_WINS_P2C_SB                         0x00
+
+EVENT_BOUNCE_ASSERT                             0x38 CBOX
+UMASK_BOUNCE_ASSERT                             0x00
+
+EVENT_BOUNCE_DEASSERT                             0x39 CBOX
+UMASK_BOUNCE_DEASSERT                             0x00
+
+EVENT_BOUNCES_C2P_AK                            0x02 CBOX
+UMASK_BOUNCES_C2P_AK_SB                         0x01
+UMASK_BOUNCES_C2P_AK_NSB                        0x02
+UMASK_BOUNCES_C2P_AK_ALL                        0x03
+
+EVENT_BOUNCES_C2P_BL                            0x03 CBOX
+UMASK_BOUNCES_C2P_BL_SB                         0x01
+UMASK_BOUNCES_C2P_BL_NSB                        0x02
+UMASK_BOUNCES_C2P_BL_ALL                        0x03
+
+EVENT_BOUNCES_C2P_AD                            0x01 CBOX
+UMASK_BOUNCES_C2P_AD_SB                         0x01
+UMASK_BOUNCES_C2P_AD_NSB                        0x02
+UMASK_BOUNCES_C2P_AD_ALL                        0x03
+
+EVENT_BOUNCES_C2P_IV                            0x04 CBOX
+UMASK_BOUNCES_C2P_IV                            0x00
+
+EVENT_EGRESS_BYPASS_WINS                        0x0C CBOX
+UMASK_EGRESS_BYPASS_WINS_AD_BYP0                0x01
+UMASK_EGRESS_BYPASS_WINS_AD_BYP1                0x02
+UMASK_EGRESS_BYPASS_WINS_AK_BYP0                0x04
+UMASK_EGRESS_BYPASS_WINS_AK_BYP1                0x08
+UMASK_EGRESS_BYPASS_WINS_BL_BYP0                0x10
+UMASK_EGRESS_BYPASS_WINS_BL_BYP1                0x20
+UMASK_EGRESS_BYPASS_WINS_IV_BYP0                0x40
+UMASK_EGRESS_BYPASS_WINS_IV_BYP1                0x80
+
+EVENT_INGRESS_BYPASS_WINS_AD                    0x0E CBOX
+UMASK_INGRESS_BYPASS_WINS_AD_IRQ_BYP0           0x01
+UMASK_INGRESS_BYPASS_WINS_AD_IRQ_BYP1           0x02
+UMASK_INGRESS_BYPASS_WINS_AD_IPQ_BYP0           0x04
+UMASK_INGRESS_BYPASS_WINS_AD_IPQ_BYP1           0x08
+
+EVENT_IDF_NONZERO_NO_BL_CRD                     0x36 CBOX
+UMASK_IDF_NONZERO_NO_BL_CRD                     0x00
+
+EVENT_IDF_NONZERO_NO_VLD                        0x37 CBOX
+UMASK_IDF_NONZERO_NO_VLD                        0x00
+
+EVENT_IGR_BID_BLOCKED                        0x3C CBOX
+UMASK_IGR_BID_BLOCKED                        0x00
+
+EVENT_IGR_OP_SRAM                            0x31 CBOX
+UMASK_IGR_OP_SRAM                            0x00
+
+EVENT_IGR_OP_UC                              0x32 CBOX
+UMASK_IGR_OP_UC                              0x00
+
+EVENT_MAF_ACK                                0x10 CBOX
+UMASK_MAF_ACK                                0x00
+
+EVENT_MAF_NACK1                                0x11 CBOX
+UMASK_MAF_NACK1_GO_PENDING                     0x01
+UMASK_MAF_NACK1_VIC_PENDING                    0x02
+UMASK_MAF_NACK1_SNP_PENDING                    0x04
+UMASK_MAF_NACK1_AC_PENDING                     0x08
+UMASK_MAF_NACK1_IDX_BLOCK                      0x10
+UMASK_MAF_NACK1_PA_BLOCK                       0x20
+UMASK_MAF_NACK1_IDLE_QPI                       0x40
+UMASK_MAF_NACK1_ALL_MAF_NACK1                  0x80
+UMASK_MAF_NACK1_TOTAL_MAF_NACKS                0xFF
+
+EVENT_MAF_NACK2                                0x12 CBOX
+UMASK_MAF_NACK2_MAF_FULL                       0x01
+UMASK_MAF_NACK2_EGRESS_FULL                    0x02
+UMASK_MAF_NACK2_VIQ_FULL                       0x04
+UMASK_MAF_NACK2_NO_TRACKER_CREDITS             0x08
+UMASK_MAF_NACK2_NO_S_FIFO_CREDITS              0x10
+UMASK_MAF_NACK2_NO_S_REQTBL_ENTRIES            0x20
+UMASK_MAF_NACK2_WB_PENDING                     0x40
+UMASK_MAF_NACK2_NACK2_ELSE                     0x80
+
+EVENT_OCCUPANCY_IPQ                            0x1A CBOX
+UMASK_OCCUPANCY_IPQ                            0x00
+
+EVENT_OCCUPANCY_IRQ                            0x18 CBOX
+UMASK_OCCUPANCY_IRQ                            0x00
+
+EVENT_OCCUPANCY_MAF                            0x1E CBOX
+UMASK_OCCUPANCY_MAF                            0x00
+
+EVENT_OCCUPANCY_RSPF                           0x22 CBOX
+UMASK_OCCUPANCY_RSPF                           0x00
+
+EVENT_OCCUPANCY_RWRF                           0x20 CBOX
+UMASK_OCCUPANCY_RWRF                           0x00
+
+EVENT_OCCUPANCY_VIQ                            0x1C CBOX
+UMASK_OCCUPANCY_VIQ                            0x00
+
+EVENT_SINKS_C2P                                0x06 CBOX
+UMASK_SINKS_C2P_IV                             0x01
+UMASK_SINKS_C2P_AK                             0x02
+UMASK_SINKS_C2P_BL                             0x04
+
+EVENT_SINKS_P2C                                0x05 CBOX
+UMASK_SINKS_P2C_IV                             0x01
+UMASK_SINKS_P2C_AK                             0x02
+UMASK_SINKS_P2C_BL                             0x04
+
+EVENT_SINKS_S2C                                0x07 CBOX
+UMASK_SINKS_S2C_AD                             0x01
+UMASK_SINKS_S2C_AK                             0x02
+UMASK_SINKS_S2C_BL                             0x04
+
+EVENT_SINKS_S2P_BL                            0x08 CBOX
+UMASK_SINKS_S2P_BL                            0x00
+
+EVENT_SNP_HITS                                  0x28 CBOX
+UMASK_SNP_HITS_REMOTE_RD_HITM                   0x01
+UMASK_SNP_HITS_REMOTE_RD_HITE                   0x02
+UMASK_SNP_HITS_REMOTE_RD_HITS                   0x04
+UMASK_SNP_HITS_REMOTE_RD_HITF                   0x08
+UMASK_SNP_HITS_REMOTE_RFO_HITM                  0x10
+UMASK_SNP_HITS_REMOTE_RFO_HITE                  0x20
+UMASK_SNP_HITS_REMOTE_RFO_HITS                  0x40
+UMASK_SNP_HITS_REMOTE_RFO_HITF                  0x80
+UMASK_SNP_HITS_REMOTE_HITM                      0x11
+UMASK_SNP_HITS_REMOTE_HITE                      0x22
+UMASK_SNP_HITS_REMOTE_HITS                      0x44
+UMASK_SNP_HITS_REMOTE_HITF                      0x88
+UMASK_SNP_HITS_REMOTE_ANY                       0xFF
+
+EVENT_SNPS                                      0x27 CBOX
+UMASK_SNPS_REMOTE_RD                            0x01
+UMASK_SNPS_REMOTE_RFO                           0x02
+UMASK_SNPS_REMOTE_ANY                           0x03
+
+EVENT_SPL_ARB_PRI_SW                            0x2A CBOX
+UMASK_SPL_ARB_PRI_SW                            0x00
+
+EVENT_SPL_CO_SB                                0x2C CBOX
+UMASK_SPL_CO_SB                                0x00
+
+EVENT_SPL_CO_NSB                                0x2D CBOX
+UMASK_SPL_CO_NSB                                0x00
+
+EVENT_SPL_DEAD                                  0x29 CBOX
+UMASK_SPL_DEAD                                  0x00
+
+EVENT_SPL_EGR_SB                               0x2F CBOX
+UMASK_SPL_EGR_SB                               0x00
+
+EVENT_SPL_EGR_NSB                               0x30 CBOX
+UMASK_SPL_EGR_NSB                               0x00
+
+EVENT_SPL_IN_FULL_IRQ                            0x2E CBOX
+UMASK_SPL_IN_FULL_IRQ                            0x00
+
+EVENT_SPL_NOT_CO                                0x2B CBOX
+UMASK_SPL_NOT_CO                                0x00
+
+EVENT_SPOOF_ASSERT                               0x3A CBOX
+UMASK_SPOOF_ASSERT                               0x00
+
+EVENT_SPOOF_DEASSERT                            0x3B CBOX
+UMASK_SPOOF_DEASSERT                            0x00
+
+EVENT_SPOOF_CRD_EMPTY                           0x35 CBOX
+UMASK_SPOOF_CRD_EMPTY                           0x00
+
+EVENT_STARVED_EGRESS                            0x0B CBOX
+UMASK_STARVED_EGRESS_P2C_AD_SB                  0x01
+UMASK_STARVED_EGRESS_C2P_AD_SB                  0x02
+UMASK_STARVED_EGRESS_AD_SB                      0x03
+UMASK_STARVED_EGRESS_AD_NSB                     0x04
+UMASK_STARVED_EGRESS_AD                         0x07
+UMASK_STARVED_EGRESS_AK_SB                      0x08
+UMASK_STARVED_EGRESS_AK_NSB                     0x10
+UMASK_STARVED_EGRESS_AK                         0x18
+UMASK_STARVED_EGRESS_BL_SB                      0x20
+UMASK_STARVED_EGRESS_BL_NSB                     0x40
+UMASK_STARVED_EGRESS_BL                         0x60
+UMASK_STARVED_EGRESS_IV                         0x80
+
+EVENT_TRANS_IPQ                                 0x1B CBOX
+UMASK_TRANS_IPQ                                 0x00
+
+EVENT_TRANS_IRQ                                 0x19 CBOX
+UMASK_TRANS_IRQ                                 0x00
+
+EVENT_TRANS_MAF                                 0x1F CBOX
+UMASK_TRANS_MAF                                 0x00
+
+EVENT_TRANS_RSPF                                0x23 CBOX
+UMASK_TRANS_RSPF                                0x00
+
+EVENT_TRANS_RWRF                                0x21 CBOX
+UMASK_TRANS_RWRF                                0x00
+
+EVENT_TRANS_VIQ                                 0x1D CBOX
+UMASK_TRANS_VIQ                                 0x00
+
+EVENT_TO_R_PROG_EV                              0x00 SBOX
+UMASK_TO_R_PROG_EV                              0x00
+
+EVENT_TO_R_B_HOM_MSGQ_CYCLES_FULL               0x03 SBOX
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_RBOX          0x01
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_BBOX          0x02
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_ALL           0x03
+
+EVENT_TO_R_B_HOM_MSGQ_CYCLES_NE                 0x06 SBOX
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_RBOX            0x01
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_BBOX            0x02
+UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_ALL             0x03
+
+EVENT_TO_R_B_HOM_MSGQ_OCCUPANCY                 0x07 SBOX
+UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_RBOX            0x01
+UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_BBOX            0x02
+UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_ALL             0x03
+
+EVENT_TO_R_SNP_MSGQ_CYCLES_FULL                 0x08 SBOX
+UMASK_TO_R_SNP_MSGQ_CYCLES_FULL                 0x00
+
+EVENT_TO_R_SNP_MSGQ_CYCLES_NE                   0x09 SBOX
+UMASK_TO_R_SNP_MSGQ_CYCLES_NE                   0x00
+
+EVENT_TO_R_SNP_MSGQ_OCCUPANCY                   0x0A SBOX
+UMASK_TO_R_SNP_MSGQ_OCCUPANCY                   0x00
+
+EVENT_TO_R_NDR_MSGQ_CYCLES_FULL                 0x0B SBOX
+UMASK_TO_R_NDR_MSGQ_CYCLES_FULL                 0x00
+
+EVENT_TO_R_NDR_MSGQ_CYCLES_NE                   0x0C SBOX
+UMASK_TO_R_NDR_MSGQ_CYCLES_NE                   0x00
+
+EVENT_TO_R_NDR_MSGQ_OCCUPANCY                   0x0D SBOX
+UMASK_TO_R_NDR_MSGQ_OCCUPANCY                   0x00
+
+EVENT_TO_R_DRS_MSGQ_CYCLES_FULL                 0x0E SBOX
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_CBOX0_5         0x01
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_CBOX1_6         0x02
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_CBOX2_7         0x04
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_CBOX3_8         0x08
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_CBOX4_9         0x10
+UMASK_TO_R_DRS_MSGQ_CYCLES_FULL_ALL             0x1F
+
+EVENT_TO_R_DRS_MSGQ_CYCLES_NE                   0x0F SBOX
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_CBOX0_5           0x01
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_CBOX1_6           0x02
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_CBOX2_7           0x04
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_CBOX3_8           0x08
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_CBOX4_9           0x10
+UMASK_TO_R_DRS_MSGQ_CYCLES_NE_ALL               0x1F
+
+EVENT_TO_R_DRS_MSGQ_OCCUPANCY                   0x10 SBOX
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_CBOX0_5           0x01
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_CBOX1_6           0x02
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_CBOX2_7           0x04
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_CBOX3_8           0x08
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_CBOX4_9           0x10
+UMASK_TO_R_DRS_MSGQ_OCCUPANCY_ALL               0x1F
+
+EVENT_TO_R_NCB_MSGQ_CYCLES_FULL                 0x11 SBOX
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_CBOX0_5         0x01
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_CBOX1_6         0x02
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_CBOX2_7         0x04
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_CBOX3_8         0x08
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_CBOX4_9         0x10
+UMASK_TO_R_NCB_MSGQ_CYCLES_FULL_ALL             0x1F
+
+EVENT_TO_R_NCB_MSGQ_CYCLES_NE                   0x12 SBOX
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_CBOX0_5           0x01
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_CBOX1_6           0x02
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_CBOX2_7           0x04
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_CBOX3_8           0x08
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_CBOX4_9           0x10
+UMASK_TO_R_NCB_MSGQ_CYCLES_NE_ALL               0x1F
+
+EVENT_TO_R_NCB_MSGQ_OCCUPANCY                   0x13 SBOX
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_CBOX0_5           0x01
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_CBOX1_6           0x02
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_CBOX2_7           0x04
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_CBOX3_8           0x08
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_CBOX4_9           0x10
+UMASK_TO_R_NCB_MSGQ_OCCUPANCY_ALL               0x1F
+
+EVENT_TO_R_NCS_MSGQ_CYCLES_FULL                 0x14 SBOX
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_CBOX0_5         0x01
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_CBOX1_6         0x02
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_CBOX2_7         0x04
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_CBOX3_8         0x08
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_CBOX4_9         0x10
+UMASK_TO_R_NCS_MSGQ_CYCLES_FULL_ALL             0x1F
+
+EVENT_TO_R_NCS_MSGQ_CYCLES_NE                   0x15 SBOX
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_CBOX0_5           0x01
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_CBOX1_6           0x02
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_CBOX2_7           0x04
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_CBOX3_8           0x08
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_CBOX4_9           0x10
+UMASK_TO_R_NCS_MSGQ_CYCLES_NE_ALL               0x1F
+
+EVENT_TO_R_NCS_MSGQ_OCCUPANCY                   0x16 SBOX
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_CBOX0_5           0x01
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_CBOX1_6           0x02
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_CBOX2_7           0x04
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_CBOX3_8           0x08
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_CBOX4_9           0x10
+UMASK_TO_R_NCS_MSGQ_OCCUPANCY_ALL               0x1F
+
+EVENT_TO_RING_SNP_MSGQ_CYCLES_FULL              0x20 SBOX
+UMASK_TO_RING_SNP_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_NCB_MSGQ_CYCLES_FULL              0x21 SBOX
+UMASK_TO_RING_NCB_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_NCS_MSGQ_CYCLES_FULL              0x22 SBOX
+UMASK_TO_RING_NCS_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_SNP_MSGQ_CYCLES_NE                0x23 SBOX
+UMASK_TO_RING_SNP_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_RING_NCB_MSGQ_CYCLES_NE                0x24 SBOX
+UMASK_TO_RING_NCB_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_RING_NCS_MSGQ_CYCLES_NE                0x25 SBOX
+UMASK_TO_RING_NCS_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_RING_MSGQ_OCCUPANCY                    0x26 SBOX
+UMASK_TO_RING_MSGQ_OCCUPANCY_SNP                0x01
+UMASK_TO_RING_MSGQ_OCCUPANCY_NCS                0x02
+UMASK_TO_RING_MSGQ_OCCUPANCY_NCB                0x04
+UMASK_TO_RING_MSGQ_OCCUPANCY_ALL                0x07
+
+EVENT_TO_RING_NDR_MSGQ_CYCLES_FULL              0x27 SBOX
+UMASK_TO_RING_NDR_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_NDR_MSGQ_CYCLES_NE                0x28 SBOX
+UMASK_TO_RING_NDR_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_RING_NDR_MSGQ_OCCUPANCY                0x29 SBOX
+UMASK_TO_RING_NDR_MSGQ_OCCUPANCY                0x00
+
+EVENT_TO_RING_R2S_MSGQ_CYCLES_FULL              0x2A SBOX
+UMASK_TO_RING_R2S_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_R2S_MSGQ_CYCLES_NE                0x2B SBOX
+UMASK_TO_RING_R2S_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_RING_R2S_MSGQ_OCCUPANCY                0x2C SBOX
+UMASK_TO_RING_R2S_MSGQ_OCCUPANCY                0x00
+
+EVENT_TO_RING_B2S_MSGQ_CYCLES_FULL              0x2D SBOX
+UMASK_TO_RING_B2S_MSGQ_CYCLES_FULL              0x00
+
+EVENT_TO_RING_B2S_MSGQ_CYCLES_NE                0x2E SBOX
+UMASK_TO_RING_B2S_MSGQ_CYCLES_NE                0x00
+
+EVENT_TO_RING_B2S_MSGQ_OCCUPANCY                0x2F SBOX
+UMASK_TO_RING_B2S_MSGQ_OCCUPANCY                0x00
+
+EVENT_HALFLINE_BYPASS                           0x30 SBOX
+UMASK_HALFLINE_BYPASS                           0x00
+
+EVENT_REQ_TBL_OCCUPANCY                         0x31 SBOX
+UMASK_REQ_TBL_OCCUPANCY_LOCAL                   0x01
+UMASK_REQ_TBL_OCCUPANCY_REMOTE                  0x02
+UMASK_REQ_TBL_OCCUPANCY_ALL                     0x03
+
+EVENT_EGRESS_BYPASS                             0x40 SBOX
+UMASK_EGRESS_BYPASS_AD_CW                      0x01
+UMASK_EGRESS_BYPASS_AD_CCW                     0x02
+UMASK_EGRESS_BYPASS_AD                         0x03
+UMASK_EGRESS_BYPASS_AK_CW                      0x04
+UMASK_EGRESS_BYPASS_AK_CCW                     0x08
+UMASK_EGRESS_BYPASS_AK                         0x0C
+UMASK_EGRESS_BYPASS_BL_CW                      0x10
+UMASK_EGRESS_BYPASS_BL_CCW                     0x20
+UMASK_EGRESS_BYPASS_BL                         0x30
+
+EVENT_EGRESS_ARB_WINS                           0x41 SBOX
+UMASK_EGRESS_ARB_WINS_AD_CW                   0x01
+UMASK_EGRESS_ARB_WINS_AD_CCW                  0x02
+UMASK_EGRESS_ARB_WINS_AD                      0x03
+UMASK_EGRESS_ARB_WINS_AK_CW                   0x04
+UMASK_EGRESS_ARB_WINS_AK_CCW                  0x08
+UMASK_EGRESS_ARB_WINS_AK                      0x0C
+UMASK_EGRESS_ARB_WINS_BL_CW                   0x10
+UMASK_EGRESS_ARB_WINS_BL_CCW                  0x20
+UMASK_EGRESS_ARB_WINS_BL                      0x30
+
+EVENT_EGRESS_ARB_LOSSES                         0x42 SBOX
+UMASK_EGRESS_ARB_LOSSES_AD_CW                   0x01
+UMASK_EGRESS_ARB_LOSSES_AD_CCW                  0x02
+UMASK_EGRESS_ARB_LOSSES_AD                      0x03
+UMASK_EGRESS_ARB_LOSSES_AK_CW                   0x04
+UMASK_EGRESS_ARB_LOSSES_AK_CCW                  0x08
+UMASK_EGRESS_ARB_LOSSES_AK                      0x0C
+UMASK_EGRESS_ARB_LOSSES_BL_CW                   0x10
+UMASK_EGRESS_ARB_LOSSES_BL_CCW                  0x20
+UMASK_EGRESS_ARB_LOSSES_BL                      0x30
+
+EVENT_EGRESS_STARVED                            0x43 SBOX
+UMASK_EGRESS_STARVED_AD_CW                      0x01
+UMASK_EGRESS_STARVED_AD_CCW                     0x02
+UMASK_EGRESS_STARVED_AD                         0x03
+UMASK_EGRESS_STARVED_AK_CW                      0x04
+UMASK_EGRESS_STARVED_AK_CCW                     0x08
+UMASK_EGRESS_STARVED_AK                         0x0C
+UMASK_EGRESS_STARVED_BL_CW                      0x10
+UMASK_EGRESS_STARVED_BL_CCW                     0x20
+UMASK_EGRESS_STARVED_BL                         0x30
+
+EVENT_RBOX_HOM_BYPASS                           0x50 SBOX
+UMASK_RBOX_HOM_BYPASS                           0x00
+
+EVENT_RBOX_SNP_BYPASS                           0x51 SBOX
+UMASK_RBOX_SNP_BYPASS_SNP                       0x01
+UMASK_RBOX_SNP_BYPASS_BIG_SNP                   0x02
+UMASK_RBOX_SNP_BYPASS_ALL                       0x03
+
+EVENT_S2B_HOM_BYPASS                            0x52 SBOX
+UMASK_S2B_HOM_BYPASS                            0x00
+
+EVENT_B2S_DRS_BYPASS                            0x53 SBOX
+UMASK_B2S_DRS_BYPASS                            0x00
+
+EVENT_BBOX_HOM_BYPASS                           0x54 SBOX
+UMASK_BBOX_HOM_BYPASS                           0x00
+
+EVENT_PKTS_SENT_HOM                             0x60 SBOX
+UMASK_PKTS_SENT_HOM_RBOX                        0x01
+UMASK_PKTS_SENT_HOM_BBOX                        0x02
+UMASK_PKTS_SENT_HOM_ALL                         0x03
+
+EVENT_PKTS_SENT_SNP                             0x62 SBOX
+UMASK_PKTS_SENT_SNP                             0x00
+
+EVENT_PKTS_RCVD_SNP                             0x71 SBOX
+UMASK_PKTS_RCVD_SNP                             0x00
+
+EVENT_PKTS_SENT_NDR                             0x63 SBOX
+UMASK_PKTS_SENT_NDR                             0x00
+
+EVENT_PKTS_RCVD_NDR                             0x70 SBOX
+UMASK_PKTS_RCVD_NDR                             0x00
+
+EVENT_PKTS_SENT_DRS                             0x64 SBOX
+UMASK_PKTS_SENT_DRS_CBOX0_5                     0x01
+UMASK_PKTS_SENT_DRS_CBOX1_6                     0x02
+UMASK_PKTS_SENT_DRS_CBOX2_7                     0x04
+UMASK_PKTS_SENT_DRS_CBOX3_8                     0x08
+UMASK_PKTS_SENT_DRS_CBOX4_9                     0x10
+UMASK_PKTS_SENT_DRS_ALL                         0x1F
+
+EVENT_FLITS_SENT_DRS                            0x65 SBOX
+UMASK_FLITS_SENT_DRS                            0x00
+
+EVENT_PKTS_RCVD_DRS_FROM_R                      0x72 SBOX
+UMASK_PKTS_RCVD_DRS_FROM_R                      0x00
+
+EVENT_PKTS_RCVD_DRS_FROM_B                      0x73 SBOX
+UMASK_PKTS_RCVD_DRS_FROM_B                      0x00
+
+EVENT_PKTS_SENT_NCS                             0x66 SBOX
+UMASK_PKTS_SENT_NCS_CBOX0_5                     0x01
+UMASK_PKTS_SENT_NCS_CBOX1_6                     0x02
+UMASK_PKTS_SENT_NCS_CBOX2_7                     0x04
+UMASK_PKTS_SENT_NCS_CBOX3_8                     0x08
+UMASK_PKTS_SENT_NCS_CBOX4_9                     0x10
+UMASK_PKTS_SENT_NCS_ALL                         0x1F
+
+EVENT_FLITS_SENT_NCS                            0x67 SBOX
+UMASK_FLITS_SENT_NCS                            0x00
+
+EVENT_PKTS_RCVD_NCS                             0x74 SBOX
+UMASK_PKTS_RCVD_NCS                             0x00
+
+EVENT_PKTS_SENT_NCB                             0x68 SBOX
+UMASK_PKTS_SENT_NCB_CBOX0_5                     0x01
+UMASK_PKTS_SENT_NCB_CBOX1_6                     0x02
+UMASK_PKTS_SENT_NCB_CBOX2_7                     0x04
+UMASK_PKTS_SENT_NCB_CBOX3_8                     0x08
+UMASK_PKTS_SENT_NCB_CBOX4_9                     0x10
+UMASK_PKTS_SENT_NCB_ALL                         0x1F
+
+EVENT_FLITS_SENT_NCB                            0x69 SBOX
+UMASK_FLITS_SENT_NCB                            0x00
+
+EVENT_PKTS_RCVD_NCB                             0x75 SBOX
+UMASK_PKTS_RCVD_NCB                             0x00
+
+EVENT_FLITS_SENT_LOC_NCS                        0x90 SBOX
+UMASK_FLITS_SENT_LOC_NCS                        0x00
+
+EVENT_PKTS_RCVD_LOC_NCS                         0x8F SBOX
+UMASK_PKTS_RCVD_LOC_NCS                         0x00
+
+EVENT_RBOX_CREDIT_RETURNS                       0x6A SBOX
+UMASK_RBOX_CREDIT_RETURNS                       0x00
+
+EVENT_BBOX_CREDIT_RETURNS                       0x6B SBOX
+UMASK_BBOX_CREDIT_RETURNS                       0x00
+
+EVENT_TO_R_B_REQUESTS                           0x6C SBOX
+UMASK_TO_R_B_REQUESTS_LOCAL                     0x01
+UMASK_TO_R_B_REQUESTS_REMOTE                    0x02
+UMASK_TO_R_B_REQUESTS_ALL                       0x03
 
+EVENT_RBOX_CREDITS                              0x76 SBOX
+UMASK_RBOX_CREDITS                              0x00
+
+EVENT_BBOX_CREDITS                              0x77 SBOX
+UMASK_BBOX_CREDITS                              0x00
+
+EVENT_NO_CREDIT_HOM                             0x80 SBOX
+UMASK_NO_CREDIT_HOM                             0x00
+
+EVENT_NO_CREDIT_SNP                             0x81 SBOX
+UMASK_NO_CREDIT_SNP                             0x00
+
+EVENT_NO_CREDIT_DRS                             0x82 SBOX
+UMASK_NO_CREDIT_DRS                             0x00
+
+EVENT_NO_CREDIT_NCS                             0x83 SBOX
+UMASK_NO_CREDIT_NCS                             0x00
+
+EVENT_NO_CREDIT_NCB                             0x84 SBOX
+UMASK_NO_CREDIT_NCB                             0x00
+
+EVENT_NO_CREDIT_NDR                             0x85 SBOX
+UMASK_NO_CREDIT_NDR                             0x00
+
+EVENT_NO_CREDIT_VNA                             0x86 SBOX
+UMASK_NO_CREDIT_VNA_RBOX                        0x01
+UMASK_NO_CREDIT_VNA_BBOX                        0x02
+UMASK_NO_CREDIT_VNA_ALL                         0x03
+
+EVENT_NO_CREDIT_AD                              0x87 SBOX
+UMASK_NO_CREDIT_AD                              0x00
+
+EVENT_NO_CREDIT_AK                              0x88 SBOX
+UMASK_NO_CREDIT_AK                              0x00
+
+EVENT_NO_CREDIT_BL                              0x89 SBOX
+UMASK_NO_CREDIT_BL                              0x00
+
+EVENT_NO_CREDIT_IPQ                             0x8A SBOX
+UMASK_NO_CREDIT_IPQ                             0x00
+
+EVENT_NO_CREDIT_LOC_NCS                         0x8B SBOX
+UMASK_NO_CREDIT_LOC_NCS                         0x00
+
+EVENT_TO_R_LOC_NCS_MSGQ_CYCLES_FULL             0x8C SBOX
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_FULL_CBOX0_5     0x01
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_FULL_CBOX1_6     0x02
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_FULL_CBOX2_7     0x04
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_FULL_CBOX3_8     0x08
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_FULL_CBOX4_9     0x10
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_FULL_ALL         0x1F
+
+EVENT_TO_R_LOC_NCS_MSGQ_CYCLES_NE               0x8D SBOX
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_NE_CBOX0_5       0x01
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_NE_CBOX1_6       0x02
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_NE_CBOX2_7       0x04
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_NE_CBOX3_8       0x08
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_NE_CBOX4_9       0x10
+UMASK_TO_R_LOC_NCS_MSGQ_CYCLES_NE_ALL           0x1F
+
+EVENT_TO_R_LOC_NCS_MSGQ_OCCUPANCY               0x8E SBOX
+UMASK_TO_R_LOC_NCS_MSGQ_OCCUPANCY_CBOX0_5       0x01
+UMASK_TO_R_LOC_NCS_MSGQ_OCCUPANCY_CBOX1_6       0x02
+UMASK_TO_R_LOC_NCS_MSGQ_OCCUPANCY_CBOX2_7       0x04
+UMASK_TO_R_LOC_NCS_MSGQ_OCCUPANCY_CBOX3_8       0x08
+UMASK_TO_R_LOC_NCS_MSGQ_OCCUPANCY_CBOX4_9       0x10
+UMASK_TO_R_LOC_NCS_MSGQ_OCCUPANCY_ALL           0x1F
 
 
diff --git a/src/includes/perfmon_westmere_events.txt b/src/includes/perfmon_westmere_events.txt
index 94a4dea..3c3e66f 100644
--- a/src/includes/perfmon_westmere_events.txt
+++ b/src/includes/perfmon_westmere_events.txt
@@ -4,8 +4,8 @@
 # 
 #      Description:  Event list for Intel Westmere
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/includes/power.h b/src/includes/power.h
index efe84dc..6cb5fd3 100644
--- a/src/includes/power.h
+++ b/src/includes/power.h
@@ -6,8 +6,8 @@
  *      Description:  Header File Power Module
  *                    Implements Intel RAPL Interface.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/power_types.h b/src/includes/power_types.h
index c503d06..b53ce85 100644
--- a/src/includes/power_types.h
+++ b/src/includes/power_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for power module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/registers.h b/src/includes/registers.h
index 2b812d0..ae80e28 100644
--- a/src/includes/registers.h
+++ b/src/includes/registers.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Register Defines for the perfmon module
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -61,13 +61,14 @@
 #define MSR_PERF_GLOBAL_OVF_CTRL  0x390
 #define MSR_PEBS_ENABLE           0x3F1
 /* Perfmon V3 */
-#define MSR_OFFCORE_RSP0              0x1A6
+#define MSR_OFFCORE_RESP0              0x1A6
+#define MSR_OFFCORE_RESP1              0x1A7
 #define MSR_UNCORE_PERF_GLOBAL_CTRL       0x391
 #define MSR_UNCORE_PERF_GLOBAL_STATUS     0x392
 #define MSR_UNCORE_PERF_GLOBAL_OVF_CTRL   0x393
 #define MSR_UNCORE_FIXED_CTR0             0x394  /* Uncore clock cycles */
 #define MSR_UNCORE_FIXED_CTR_CTRL         0x395 /*FIXME: Is this correct? */
-#define MSR_UNCORE_ADDR_OPCODE_MATCH      0x396 
+#define MSR_UNCORE_ADDR_OPCODE_MATCH      0x396
 #define MSR_UNCORE_PERFEVTSEL0         0x3C0
 #define MSR_UNCORE_PERFEVTSEL1         0x3C1
 #define MSR_UNCORE_PERFEVTSEL2         0x3C2
@@ -84,7 +85,36 @@
 #define MSR_UNCORE_PMC5                0x3B5
 #define MSR_UNCORE_PMC6                0x3B6
 #define MSR_UNCORE_PMC7                0x3B7
-
+/*
+ * Perfmon V4 registers. Used starting with Haswell; according to the
+ * Intel Software Developer's Manual they also apply to SandyBridge
+ * (IvyBridge is not mentioned in that section of the manual).
+ */
+#define MSR_UNC_PERF_GLOBAL_CTRL       MSR_UNCORE_PERF_GLOBAL_CTRL /* alias of the V3 define (0x391) */
+#define MSR_UNC_PERF_GLOBAL_STATUS     MSR_UNCORE_PERF_GLOBAL_STATUS /* alias of the V3 define (0x392) */
+#define MSR_UNC_PERF_FIXED_CTRL        MSR_UNCORE_FIXED_CTR0 /* resolves to 0x394. NOTE(review): CTRL aliasing a CTR0 name looks crossed; presumably address-based - confirm against the Intel SDM */
+#define MSR_UNC_PERF_FIXED_CTR         MSR_UNCORE_FIXED_CTR_CTRL /* resolves to 0x395. NOTE(review): CTR aliasing a CTR_CTRL name looks crossed; confirm against the Intel SDM */
+#define MSR_UNC_ARB_PERFEVTSEL0        MSR_UNCORE_PMC2 /* NOTE(review): event select aliased to a V3 counter name; presumably chosen by address - confirm */
+#define MSR_UNC_ARB_PERFEVTSEL1        MSR_UNCORE_PMC3 /* NOTE(review): event select aliased to a V3 counter name; presumably chosen by address - confirm */
+#define MSR_UNC_ARB_CTR0               MSR_UNCORE_PMC0 /* alias of the V3 uncore counter 0 define */
+#define MSR_UNC_ARB_CTR1               MSR_UNCORE_PMC1 /* alias of the V3 uncore counter 1 define */
+#define MSR_UNC_CBO_CONFIG             0x396
+#define MSR_UNC_CBO_0_PERFEVTSEL0      0x700
+#define MSR_UNC_CBO_0_PERFEVTSEL1      0x701
+#define MSR_UNC_CBO_0_CTR0             0x706
+#define MSR_UNC_CBO_0_CTR1             0x707
+#define MSR_UNC_CBO_1_PERFEVTSEL0      0x710
+#define MSR_UNC_CBO_1_PERFEVTSEL1      0x711
+#define MSR_UNC_CBO_1_CTR0             0x716
+#define MSR_UNC_CBO_1_CTR1             0x717
+#define MSR_UNC_CBO_2_PERFEVTSEL0      0x720
+#define MSR_UNC_CBO_2_PERFEVTSEL1      0x721
+#define MSR_UNC_CBO_2_CTR0             0x726
+#define MSR_UNC_CBO_2_CTR1             0x727
+#define MSR_UNC_CBO_3_PERFEVTSEL0      0x730
+#define MSR_UNC_CBO_3_PERFEVTSEL1      0x731
+#define MSR_UNC_CBO_3_CTR0             0x736
+#define MSR_UNC_CBO_3_CTR1             0x737
 /* Xeon Phi */
 #define MSR_MIC_TSC                   0x010
 #define MSR_MIC_PERFEVTSEL0           0x028
@@ -92,9 +122,9 @@
 #define MSR_MIC_PMC0                  0x020
 #define MSR_MIC_PMC1                  0x021
 #define MSR_MIC_SPFLT_CONTROL         0x02C
-#define MSR_MIC_PERF_GLOBAL_STATUS    0x02D 
-#define MSR_MIC_PERF_GLOBAL_OVF_CTRL  0x02E 
-#define MSR_MIC_PERF_GLOBAL_CTRL      0x02F 
+#define MSR_MIC_PERF_GLOBAL_STATUS    0x02D
+#define MSR_MIC_PERF_GLOBAL_OVF_CTRL  0x02E
+#define MSR_MIC_PERF_GLOBAL_CTRL      0x02F
 
 
 /* Core v1/v2 type uncore
@@ -304,6 +334,10 @@
 #define MSR_UNC_U_PMON_CTL1             0xC11
 #define MSR_UNC_U_UCLK_FIXED_CTR        0xC09
 #define MSR_UNC_U_UCLK_FIXED_CTL        0xC08
+#define MSR_UNC_U_PMON_BOX_STATUS       0xC15
+#define MSR_UNC_U_PMON_GLOBAL_STATUS    0xC01
+#define MSR_UNC_U_PMON_GLOBAL_CTL       0xC00
+#define MSR_UNC_U_PMON_GLOBAL_CONFIG    0xC06
 
 /* HA Box Performance Monitoring */
 
@@ -641,6 +675,38 @@
 #define MSR_C7_PMON_CTR4                0xDF9
 #define MSR_C7_PMON_EVNT_SEL5           0xDFA
 #define MSR_C7_PMON_CTR5                0xDFB
+/* C box 8 - Coherence Engine core 8 */
+#define MSR_C8_PMON_BOX_CTRL            0xF40
+#define MSR_C8_PMON_BOX_STATUS          0xF41
+#define MSR_C8_PMON_BOX_OVF_CTRL        0xF42
+#define MSR_C8_PMON_EVNT_SEL0           0xF50
+#define MSR_C8_PMON_CTR0                0xF51
+#define MSR_C8_PMON_EVNT_SEL1           0xF52
+#define MSR_C8_PMON_CTR1                0xF53
+#define MSR_C8_PMON_EVNT_SEL2           0xF54
+#define MSR_C8_PMON_CTR2                0xF55
+#define MSR_C8_PMON_EVNT_SEL3           0xF56
+#define MSR_C8_PMON_CTR3                0xF57
+#define MSR_C8_PMON_EVNT_SEL4           0xF58
+#define MSR_C8_PMON_CTR4                0xF59
+#define MSR_C8_PMON_EVNT_SEL5           0xF5A
+#define MSR_C8_PMON_CTR5                0xF5B
+/* C box 9 - Coherence Engine core 9 */
+#define MSR_C9_PMON_BOX_CTRL            0xFC0
+#define MSR_C9_PMON_BOX_STATUS          0xFC1
+#define MSR_C9_PMON_BOX_OVF_CTRL        0xFC2
+#define MSR_C9_PMON_EVNT_SEL0           0xFD0
+#define MSR_C9_PMON_CTR0                0xFD1
+#define MSR_C9_PMON_EVNT_SEL1           0xFD2
+#define MSR_C9_PMON_CTR1                0xFD3
+#define MSR_C9_PMON_EVNT_SEL2           0xFD4
+#define MSR_C9_PMON_CTR2                0xFD5
+#define MSR_C9_PMON_EVNT_SEL3           0xFD6
+#define MSR_C9_PMON_CTR3                0xFD7
+#define MSR_C9_PMON_EVNT_SEL4           0xFD8
+#define MSR_C9_PMON_CTR4                0xFD9
+#define MSR_C9_PMON_EVNT_SEL5           0xFDA
+#define MSR_C9_PMON_CTR5                0xFDB
 /* R box 0 - Router 0 */
 #define MSR_R0_PMON_BOX_CTRL            0xE00
 #define MSR_R0_PMON_BOX_STATUS          0xE01
@@ -749,6 +815,8 @@
 #define MSR_TURBO_POWER_CURRENT_LIMIT   0x1AC
 #define MSR_TURBO_RATIO_LIMIT           0x1AD
 
+/* Intel Silvermont's RAPL registers */
+#define MSR_PKG_POWER_INFO_SILVERMONT   0x66E
 /*
  * AMD
  */
diff --git a/src/includes/strUtil.h b/src/includes/strUtil.h
index 5fad5df..18236b6 100644
--- a/src/includes/strUtil.h
+++ b/src/includes/strUtil.h
@@ -6,8 +6,8 @@
  *      Description:  Header File strUtil Module. 
  *                    Helper routines for bstrlib and command line parsing
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -37,7 +37,7 @@
 #include <time.h>
 
 #define CHECK_OPTION_STRING  \
-if (! (argString = bSecureInput(200,optarg))) {  \
+if (! (argString = bSecureInput(400,optarg))) {  \
     ERROR_PLAIN_PRINT(Failed to read argument string!);  \
 }
 
diff --git a/src/includes/strUtil_types.h b/src/includes/strUtil_types.h
index 4dec99e..25766ff 100644
--- a/src/includes/strUtil_types.h
+++ b/src/includes/strUtil_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for strUtil module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/test_types.h b/src/includes/test_types.h
index e08fffa..45c0932 100644
--- a/src/includes/test_types.h
+++ b/src/includes/test_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Type definitions for benchmarking framework
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -38,7 +38,10 @@ typedef void (*FuncPrototype)();
 
 typedef enum {
     SINGLE = 0,
-    DOUBLE} DataType;
+    DOUBLE,
+    SINGLE_RAND,
+    DOUBLE_RAND
+} DataType;
 
 typedef enum {
     STREAM_1 = 1,
@@ -79,7 +82,8 @@ typedef enum {
     STREAM_36,
     STREAM_37,
     STREAM_38,
-    MAX_STREAMS} Pattern;
+    MAX_STREAMS
+} Pattern;
 
 typedef struct {
     char* name;
@@ -87,15 +91,15 @@ typedef struct {
     DataType type ;
     int stride;
     FuncPrototype kernel;
-    int  flops;
-    int  bytes;
+    double flops;
+    int bytes;
 } TestCase;
 
 typedef struct {
-    uint64_t   size;
-    uint32_t   iter;
+    uint64_t size;
+    uint32_t iter;
     const TestCase* test;
-    uint64_t   cycles;
+    uint64_t cycles;
     uint32_t numberOfThreads;
     int* processors;
     void** streams;
diff --git a/src/includes/textcolor.h b/src/includes/textcolor.h
index 976a0a0..4c1b7b1 100644
--- a/src/includes/textcolor.h
+++ b/src/includes/textcolor.h
@@ -7,8 +7,8 @@
  *                    Allows toggling of terminal escape sequences for 
  *                    colored text.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/thermal.h b/src/includes/thermal.h
index f104aa1..3153386 100644
--- a/src/includes/thermal.h
+++ b/src/includes/thermal.h
@@ -6,8 +6,8 @@
  *      Description:  Header File Thermal Module.
  *                    Implements Intel TM/TM2 Interface.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -44,7 +44,10 @@ static inline uint32_t thermal_read(int cpuId);
 static uint32_t
 thermal_read(int cpuId)
 {
-    return (thermal_info.activationT - extractBitField(msr_read(cpuId, IA32_THERM_STATUS),7,16));
+    uint32_t readout = extractBitField(msr_read(cpuId, IA32_THERM_STATUS),7,16);
+    return (readout == 0 ? 
+            thermal_info.activationT - thermal_info.offset :
+            (thermal_info.activationT-thermal_info.offset) - readout );
 }
 
 #endif /*THERMAL_H*/
diff --git a/src/includes/thermal_types.h b/src/includes/thermal_types.h
index 71da84d..a619180 100644
--- a/src/includes/thermal_types.h
+++ b/src/includes/thermal_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for thermal module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -38,6 +38,7 @@ typedef struct {
     uint16_t highT;
     uint32_t resolution;
     uint32_t activationT;
+    uint32_t offset;
 } ThermalInfo;
 
 
diff --git a/src/includes/threads.h b/src/includes/threads.h
index 2db402c..6e00191 100644
--- a/src/includes/threads.h
+++ b/src/includes/threads.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Header file of pthread interface module
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -34,6 +34,7 @@
 #include <types.h>
 #include <pthread.h>
 #include <threads_types.h>
+#include <stdio.h>
 
 #define THREADS_BARRIER pthread_barrier_wait(&threads_barrier)
 
@@ -46,7 +47,7 @@ extern ThreadGroup* threads_groups;
  * @brief  Initialization of the thread module
  * @param  numberOfThreads  The total number of threads
  */
-extern void threads_init(int numberOfThreads);
+extern void threads_init(FILE* OUTSTREAM, int numberOfThreads);
 
 /**
  * @brief  Create all threads
diff --git a/src/includes/threads_types.h b/src/includes/threads_types.h
index 783cbfd..dfa13f3 100644
--- a/src/includes/threads_types.h
+++ b/src/includes/threads_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for threads module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -31,23 +31,25 @@
 #ifndef THREADS_TYPES_H
 #define THREADS_TYPES_H
 
+#include <stdio.h>
 #include <stdint.h>
 
 typedef struct {
-    int        globalNumberOfThreads;
-    int        numberOfThreads;
-    int        globalThreadId;
-    int        threadId;
-    int        numberOfGroups;
-    int        groupId;
-    double      time;
-    uint64_t   cycles;
+    int globalNumberOfThreads;
+    int numberOfThreads;
+    int globalThreadId;
+    int threadId;
+    int numberOfGroups;
+    int groupId;
+    double time;
+    uint64_t cycles;
+    FILE* output;
     ThreadUserData data;
 } ThreadData;
 
 typedef struct {
-    int        numberOfThreads;
-    int*       threadIds;
+    int numberOfThreads;
+    int* threadIds;
 } ThreadGroup;
 
 typedef void (*threads_copyDataFunc)(ThreadUserData* src,ThreadUserData* dst);
diff --git a/src/includes/timer.h b/src/includes/timer.h
index 77da7c3..b97f4ac 100644
--- a/src/includes/timer.h
+++ b/src/includes/timer.h
@@ -10,8 +10,8 @@
  *      with rdtsc of 100 cycles in the worst case. Therefore sensible
  *      measurements should be over 1000 cycles.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -38,30 +38,28 @@
 #include <types.h>
 
 #define RDTSC(cpu_c) \
-__asm__ volatile("xor %%eax,%%eax\n\t"           \
-"cpuid\n\t"           \
-"rdtsc\n\t"           \
-"movl %%eax, %0\n\t"  \
-"movl %%edx, %1\n\t"  \
-: "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
-: : "%eax","%ebx","%ecx","%edx")
+    __asm__ volatile("xor %%eax,%%eax\n\t" \
+                     "cpuid\n\t"           \
+                     "rdtsc\n\t"           \
+                     "movl %%eax, %0\n\t"  \
+                     "movl %%edx, %1\n\t"  \
+    : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
+    : : "%eax","%ebx","%ecx","%edx")
 
 #define RDTSC_CR(cpu_c) \
-__asm__ volatile(   \
-"rdtsc\n\t"           \
-"movl %%eax, %0\n\t"  \
-"movl %%edx, %1\n\t"  \
-: "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
-: : "%eax","%ebx","%ecx","%edx")
+    __asm__ volatile("rdtsc\n\t"           \
+                     "movl %%eax, %0\n\t"  \
+                     "movl %%edx, %1\n\t"  \
+    : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
+    : : "%eax","%ebx","%ecx","%edx")
 
 #define RDTSCP(cpu_c) \
-__asm__ volatile(     \
-"rdtscp\n\t"          \
-"movl %%eax, %0\n\t"  \
-"movl %%edx, %1\n\t"  \
-"cpuid\n\t"           \
-: "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
-: : "%eax","%ebx","%ecx","%edx")
+    __asm__ volatile("rdtscp\n\t"          \
+                     "movl %%eax, %0\n\t"  \
+                     "movl %%edx, %1\n\t"  \
+                     "cpuid\n\t"           \
+    : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
+    : : "%eax","%ebx","%ecx","%edx")
 
 #ifdef HAS_RDTSCP
 #define RDTSC_STOP(cpu_c) RDTSCP(cpu_c);
@@ -85,15 +83,15 @@ void timer_start( TimerData* time )
     RDTSC(time->start);
 #endif
 #ifdef _ARCH_PPC
-	uint32_t tbl, tbu0, tbu1;
+    uint32_t tbl, tbu0, tbu1;
 
-	do {
-		__asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
-		__asm__ __volatile__ ("mftb %0" : "=r"(tbl));
-		__asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
-	} while (tbu0 != tbu1);
+    do {
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+    } while (tbu0 != tbu1);
 
-	time->start.int64 = (((uint64_t)tbu0) << 32) | tbl;
+    time->start.int64 = (((uint64_t)tbu0) << 32) | tbl;
 #endif
 }
 
@@ -103,14 +101,14 @@ void timer_stop( TimerData* time )
     RDTSC_STOP(time->stop)
 #endif
 #ifdef _ARCH_PPC
-	uint32_t tbl, tbu0, tbu1;
-	do {
-		__asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
-		__asm__ __volatile__ ("mftb %0" : "=r"(tbl));
-		__asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
-	} while (tbu0 != tbu1);
+    uint32_t tbl, tbu0, tbu1;
+    do {
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+    } while (tbu0 != tbu1);
 
-	time->stop.int64 = (((uint64_t)tbu0) << 32) | tbl;
+    time->stop.int64 = (((uint64_t)tbu0) << 32) | tbl;
 #endif
 }
 
diff --git a/src/includes/timer_types.h b/src/includes/timer_types.h
index 4437881..265d5c9 100644
--- a/src/includes/timer_types.h
+++ b/src/includes/timer_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for timer module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/tree.h b/src/includes/tree.h
index 428e1ca..9816cf7 100644
--- a/src/includes/tree.h
+++ b/src/includes/tree.h
@@ -6,8 +6,8 @@
  *      Description:  Header File tree Module. 
  *                    Implements a simple tree data structure.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/tree_types.h b/src/includes/tree_types.h
index 6593a91..b449e39 100644
--- a/src/includes/tree_types.h
+++ b/src/includes/tree_types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Types file for tree module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/types.h b/src/includes/types.h
index 6d99813..2b0745a 100644
--- a/src/includes/types.h
+++ b/src/includes/types.h
@@ -5,8 +5,8 @@
  *
  *      Description:  Global  Types file
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/libperfctr.c b/src/libperfctr.c
index 97b3e4d..a4b2158 100644
--- a/src/libperfctr.c
+++ b/src/libperfctr.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Marker API interface of module perfmon
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -69,6 +69,7 @@
 #include <perfmon_sandybridge_counters.h>
 #include <perfmon_ivybridge_counters.h>
 #include <perfmon_westmereEX_counters.h>
+#include <perfmon_silvermont_counters.h>
 
 
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
@@ -91,32 +92,31 @@ static BitMask counterMask;
 
 void str2BitMask(const char* str, BitMask* mask)
 {
-  char* endptr;
-  errno = 0;
-  struct bstrList* tokens;
-  bstring q = bfromcstralloc (60, str);
-  tokens = bsplit(q,' ');
-
-  for (int i=0; i<tokens->qty; i++)
-  {
-      uint64_t val =  strtoull((char*) tokens->entry[i]->data, &endptr, 16);
-
-      if ((errno == ERANGE && val == LONG_MAX )
-              || (errno != 0 && val == 0))
-      {
-          ERROR;
-      }
-
-      if (endptr == str)
-      {
-          ERROR_PLAIN_PRINT(No digits were found);
-      }
-
-      mask->mask[i] = val;
-  }
-
-  bstrListDestroy(tokens);
-  bdestroy(q);
+    char* endptr;
+    errno = 0;
+    struct bstrList* tokens;
+    bstring q = bfromcstralloc (60, str);
+    tokens = bsplit(q,' '); /* one hexadecimal word per space-separated token */
+
+    for (int i=0; i<tokens->qty; i++)
+    {
+        uint64_t val =  strtoull((char*) tokens->entry[i]->data, &endptr, 16);
+
+        if ((errno == ERANGE && val == ULLONG_MAX ) || (errno != 0 && val == 0)) /* strtoull saturates at ULLONG_MAX */
+        {
+            ERROR;
+        }
+
+        if (endptr == (char*) tokens->entry[i]->data) /* nothing consumed from this token */
+        {
+            ERROR_PLAIN_PRINT(No digits were found);
+        }
+
+        mask->mask[i] = val;
+    }
+
+    bstrListDestroy(tokens);
+    bdestroy(q);
 }
 
 static int getProcessorID(cpu_set_t* cpu_set)
@@ -204,6 +204,17 @@ void likwid_markerInit(void)
                     perfmon_numCountersCore = NUM_COUNTERS_CORE_CORE2;
                     break;
 
+                case ATOM_SILVERMONT_C:
+                case ATOM_SILVERMONT_E:
+                case ATOM_SILVERMONT_F1:
+                case ATOM_SILVERMONT_F2:
+                case ATOM_SILVERMONT_F3:
+                    power_init(0);
+                    perfmon_counter_map = silvermont_counter_map;
+                    perfmon_numCounters = NUM_COUNTERS_SILVERMONT;
+                    perfmon_numCountersCore = NUM_COUNTERS_CORE_SILVERMONT;
+                    break;
+
                 case CORE_DUO:
                     ERROR_PLAIN_PRINT(Unsupported Processor);
                     break;
@@ -681,8 +692,16 @@ void likwid_markerStopRegion(const char* regionTag)
             {
                 if ( perfmon_counter_map[i].type == POWER )
                 {
-                    results->PMcounters[i] += power_info.energyUnit *
-                        (PMcounters[i] - results->StartPMcounters[i]);
+                    if (PMcounters[i] >= results->StartPMcounters[i])
+                    {
+                        results->PMcounters[i] += power_info.energyUnit *
+                            (PMcounters[i] - results->StartPMcounters[i]);
+                    }
+                    else
+                    {
+                        results->PMcounters[i] += power_info.energyUnit *
+                            (((double)0xFFFFFFFF) - results->StartPMcounters[i] + PMcounters[i]);
+                    }
                 }
                 else
                 {
diff --git a/src/likwid.f90 b/src/likwid.f90
index b4a3c12..1215dd4 100644
--- a/src/likwid.f90
+++ b/src/likwid.f90
@@ -4,8 +4,8 @@
 !
 !     Description: Marker API f90 module
 !
-!      Version:   3.1.2
-!      Released:  2.6.2014
+!      Version:   3.1.3
+!      Released:  4.11.2014
 !
 !     Author:  Jan Treibig (jt), jan.treibig at gmail.com
 !     Project:  likwid
@@ -32,22 +32,22 @@ module likwid
 
 interface
 
-  subroutine likwid_markerInit()
-  end subroutine likwid_markerInit
+    subroutine likwid_markerInit()
+    end subroutine likwid_markerInit
 
-  subroutine likwid_markerThreadInit()
-  end subroutine likwid_markerThreadInit
+    subroutine likwid_markerThreadInit()
+    end subroutine likwid_markerThreadInit
 
-  subroutine likwid_markerClose()
-  end subroutine likwid_markerClose
+    subroutine likwid_markerClose()
+    end subroutine likwid_markerClose
 
-  subroutine likwid_markerStartRegion( regionTag )
-  character(*) :: regionTag
-  end subroutine likwid_markerStartRegion
+    subroutine likwid_markerStartRegion( regionTag )
+    character(*) :: regionTag
+    end subroutine likwid_markerStartRegion
 
-  subroutine likwid_markerStopRegion( regionTag )
-  character(*) :: regionTag
-  end subroutine likwid_markerStopRegion
+    subroutine likwid_markerStopRegion( regionTag )
+    character(*) :: regionTag
+    end subroutine likwid_markerStopRegion
 
 end interface
 
diff --git a/src/likwid_f90_interface.c b/src/likwid_f90_interface.c
index cc6ea5d..31bad92 100644
--- a/src/likwid_f90_interface.c
+++ b/src/likwid_f90_interface.c
@@ -5,8 +5,8 @@
  *
  *      Description: F90 interface for marker API
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -62,7 +62,7 @@ void likwid_markerstartregion_(char* regionTag, int len)
     }
 
     likwid_markerStartRegion( tmp );
-	free(tmp);
+    free(tmp);
 }
 
 void likwid_markerstopregion_(char* regionTag, int len)
@@ -79,6 +79,6 @@ void likwid_markerstopregion_(char* regionTag, int len)
     }
 
     likwid_markerStopRegion( tmp );
-	free(tmp);
+    free(tmp);
 }
 
diff --git a/src/memsweep.c b/src/memsweep.c
index 1af4b5e..8abf796 100644
--- a/src/memsweep.c
+++ b/src/memsweep.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Implementation of sweeper module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -60,11 +60,11 @@ static uint64_t  memoryFraction = 80ULL;
 static void*
 allocateOnNode(size_t size, int domainId)
 {
-	char *ptr; 
+    char *ptr; 
 
-	ptr = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+    ptr = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
 
-	if (ptr == (char *)-1)
+    if (ptr == (char *)-1)
     {
         ERROR;
     }
@@ -101,11 +101,14 @@ findProcessor(uint32_t nodeId, uint32_t coreId)
 }
 
 /* evict all dirty cachelines from last level cache */
-static void cleanupCache(char* ptr)
+static void cleanupCache(FILE* OUTSTREAM, char* ptr)
 {
 #ifdef __x86_64
     uint32_t cachesize = 2 * cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].size;
-    printf("Cleanup LLC using %u MB\n", cachesize / (1000000));
+    if (OUTSTREAM != NULL)
+    {
+        fprintf(OUTSTREAM, "Cleanup LLC using %u MB\n", cachesize / (1000000));
+    }
     _loadData(cachesize,ptr);
 #else
     ERROR_PLAIN_PRINT(Cleanup cache is currently only available on 64bit X86 systems.);
@@ -122,32 +125,35 @@ memsweep_setMemoryFraction(uint64_t fraction)
 
 
 void
-memsweep_node(void)
+memsweep_node(FILE* OUTSTREAM)
 {
     for ( uint32_t i=0; i < numa_info.numberOfNodes; i++)
     {
-        memsweep_domain(i);
+        memsweep_domain(OUTSTREAM, i);
     }
 }
 
 
 void
-memsweep_domain(int domainId)
+memsweep_domain(FILE* OUTSTREAM, int domainId)
 {
     char* ptr = NULL;
     size_t size = numa_info.nodes[domainId].totalMemory * 1024ULL * memoryFraction / 100ULL;
-    printf("Sweeping domain %d: Using %g MB of %g MB\n",
-            domainId,
-            size / (1000.0 * 1000.0),
-            numa_info.nodes[domainId].totalMemory/ 1000.0);
+    if (OUTSTREAM != NULL)
+    {
+        fprintf(OUTSTREAM, "Sweeping domain %d: Using %g MB of %g MB\n",
+                domainId,
+                size / (1000.0 * 1000.0),
+                numa_info.nodes[domainId].totalMemory/ 1000.0);
+    }
     ptr = (char*) allocateOnNode(size, domainId);
     initMemory(size, ptr, domainId);
-    cleanupCache(ptr);
+    cleanupCache(OUTSTREAM, ptr);
     munmap(ptr, size);
 }
 
 void
-memsweep_threadGroup(int* processorList, int numberOfProcessors)
+memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors)
 {
     for (uint32_t i=0; i<numa_info.numberOfNodes; i++)
     {
@@ -155,7 +161,7 @@ memsweep_threadGroup(int* processorList, int numberOfProcessors)
         {
             if (findProcessor(i,processorList[j]))
             {
-                memsweep_domain(i);
+                memsweep_domain(OUTSTREAM, i);
                 break;
             }
         }
diff --git a/src/msr.c b/src/msr.c
index 448185b..cb867f2 100644
--- a/src/msr.c
+++ b/src/msr.c
@@ -9,8 +9,8 @@
  *                   sys interface of the Linux 2.6 kernel. This module 
  *                   is based on the msr-util tools.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -45,12 +45,14 @@
 #include <sys/stat.h>
 #include <sys/socket.h>
 #include <sys/un.h>
+#include <sys/wait.h>
 
 #include <types.h>
 #include <error.h>
 #include <cpuid.h>
 #include <accessClient.h>
 #include <msr.h>
+#include <registers.h>
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 #define MAX_LENGTH_MSR_DEV_NAME  20
@@ -60,9 +62,61 @@
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
 static int FD[MAX_NUM_THREADS];
 static int socket_fd = -1;
+static int rdpmc_works = 0;
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+static inline int __rdpmc(int counter, uint64_t* value)
+{
+    unsigned low, high;
+    __asm__ volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
+    *value = ((low) | ((uint64_t )(high) << 32));
+    return 0;
+}
+/* SIGSEGV handler installed in the forked probe child of test_rdpmc() */
+void segfault_sigaction(int signal, siginfo_t *si, void *arg)
+{
+    _exit(1); /* async-signal-safe; exit() would run atexit handlers and flush inherited stdio */
+}
 
+int test_rdpmc(int flag) /* returns 1 if the child exited cleanly, 0 if not, -1 if fork failed */
+{
+    int ret, waiting;
+    int pid;
+    int status = 0;
+    uint64_t tmp;
+    struct sigaction sa;
+    memset(&sa, 0, sizeof(struct sigaction));
+    sigemptyset(&sa.sa_mask);
+    sa.sa_sigaction = segfault_sigaction;
+    sa.sa_flags   = SA_SIGINFO;
+
+    pid = fork();
+
+    if (pid < 0)
+    {
+        return -1;
+    }
+    if (!pid)
+    {
+        sigaction(SIGSEGV, &sa, NULL);
+        if (flag == 0)
+        {
+            __rdpmc(0, &tmp); /* probe: the handler turns a SIGSEGV here into exit status 1 */
+        }
+        _exit(0); /* leave the child without touching the parent's buffered output */
+    } else {
+
+        waiting = waitpid(pid, &status, 0);
+        if (waiting < 0 || status)
+        {
+            ret = 0;
+        } else
+        {
+            ret = 1;
+        }
+    }
+    return ret;
+}
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
@@ -72,55 +126,42 @@ msr_init(int initSocket_fd)
 {
     if (accessClient_mode == DAEMON_AM_DIRECT)
     {
+        char* msr_file_name = (char*) malloc(MAX_LENGTH_MSR_DEV_NAME * sizeof(char));
 
-        int  fd;
-#ifdef __MIC
-        char* msr_file_name = "/dev/msr0";
-		if( access( msr_file_name, F_OK ) == -1 )
-		{
-			msr_file_name = "/dev/cpu/0/msr";
-		}
-#else 
-        char* msr_file_name = "/dev/cpu/0/msr";
-#endif 
-
-        fd = open(msr_file_name, O_RDWR);
+        sprintf(msr_file_name,"/dev/msr0");
+        if( access( msr_file_name, F_OK ) == -1 )
+        {
+            sprintf(msr_file_name,"/dev/cpu/0/msr");
+        }
 
-        if (fd < 0)
+        if (access(msr_file_name, R_OK|W_OK))
         {
-            fprintf(stderr, "ERROR\n");
-            fprintf(stderr, "rdmsr: failed to open '%s': %s!\n",
-                    msr_file_name , strerror(errno));
-            fprintf(stderr, "       Please check if the msr module \
-                    is loaded and the device file has correct permissions.\n");
-            fprintf(stderr, "       Alternatively you might want to \
-                    look into (sys)daemonmode.\n\n");
+            ERROR_PRINT(Cannot access MSR device file %s: %s.\n
+                        Please check if 'msr' module is loaded and device files have correct permissions\n
+                        Alternatively you might want to look into (sys)daemonmode\n,msr_file_name , strerror(errno));
+            free(msr_file_name);
             exit(127);
         }
-
-        close(fd);
+        rdpmc_works = test_rdpmc(0);
 
         /* NOTICE: This assumes consecutive processor Ids! */
         for ( uint32_t i=0; i < cpuid_topology.numHWThreads; i++ )
         {
-            char* msr_file_name = (char*) malloc(MAX_LENGTH_MSR_DEV_NAME * sizeof(char));
-#ifdef __MIC
-			sprintf(msr_file_name,"/dev/msr%d",i);
-			if( access( msr_file_name, F_OK ) == -1 )
-			{
-				sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
-			}
-#else
-            sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
-#endif
-
+            sprintf(msr_file_name,"/dev/msr%d",i);
+            if( access( msr_file_name, F_OK ) == -1 )
+            {
+                sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
+            }
             FD[i] = open(msr_file_name, O_RDWR);
-
             if ( FD[i] < 0 )
             {
+                ERROR_PRINT(Cannot access MSR device file %s: %s\n,
+                                msr_file_name , strerror(errno));
+                free(msr_file_name);
                 ERROR;
             }
         }
+        free(msr_file_name);
     }
     else
     {
@@ -137,6 +178,7 @@ msr_finalize(void)
         {
             close(FD[i]);
         }
+        rdpmc_works = 0;
     }
     else
     {
@@ -152,9 +194,29 @@ msr_tread(const int tsocket_fd, const int cpu, uint32_t reg)
     {
         uint64_t data;
 
-        if ( pread(FD[cpu], &data, sizeof(data), reg) != sizeof(data) )
+        if (rdpmc_works && reg >= MSR_PMC0 && reg <=MSR_PMC3)
+        {
+            if (__rdpmc(reg - MSR_PMC0, &data) )
+            {
+                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
+                        reg,cpu);
+            }
+        }
+        else if (rdpmc_works && reg >= MSR_PERF_FIXED_CTR0 && reg <= MSR_PERF_FIXED_CTR2)
         {
-            ERROR_PRINT("cpu %d reg %x",cpu, reg);
+            if (__rdpmc(0x40000000ULL + (reg - MSR_PERF_FIXED_CTR0), &data) ) /* ECX bit 30 selects the fixed-function counters */
+            {
+                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
+                        reg,cpu);
+            }
+        }
+        else
+        {
+            if ( pread(FD[cpu], &data, sizeof(data), reg) != sizeof(data) )
+            {
+                ERROR_PRINT(Cannot read MSR reg 0x%x with RDMSR instruction on CPU %d\n,
+                        reg, cpu);
+            }
         }
 
         return data;
@@ -173,7 +235,8 @@ msr_twrite(const int tsocket_fd, const int cpu, uint32_t reg, uint64_t data)
     {
         if (pwrite(FD[cpu], &data, sizeof(data), reg) != sizeof(data))
         {
-            ERROR_PRINT("cpu %d reg %x",cpu, reg);
+            ERROR_PRINT(Cannot write MSR reg 0x%x with WRMSR instruction on CPU %d\n,
+                        reg, cpu);
         }
     }
     else
@@ -190,9 +253,29 @@ msr_read( const int cpu, uint32_t reg)
     {
         uint64_t data;
 
-        if ( pread(FD[cpu], &data, sizeof(data), reg) != sizeof(data) )
+        if (rdpmc_works && reg >= MSR_PMC0 && reg <=MSR_PMC3)
         {
-            ERROR_PRINT("cpu %d reg %x",cpu, reg);
+            if (__rdpmc(reg - MSR_PMC0, &data) )
+            {
+                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
+                        reg,cpu);
+            }
+        }
+        else if (rdpmc_works && reg >= MSR_PERF_FIXED_CTR0 && reg <= MSR_PERF_FIXED_CTR2)
+        {
+            if (__rdpmc(0x40000000ULL + (reg - MSR_PERF_FIXED_CTR0), &data) ) /* ECX bit 30 selects the fixed-function counters */
+            {
+                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
+                        reg,cpu);
+            }
+        }
+        else
+        {
+            if ( pread(FD[cpu], &data, sizeof(data), reg) != sizeof(data) )
+            {
+                ERROR_PRINT(Cannot read MSR reg 0x%x with RDMSR instruction on CPU %d\n,
+                        reg, cpu);
+            }
         }
 
         return data;
@@ -211,7 +294,8 @@ msr_write( const int cpu, uint32_t reg, uint64_t data)
     {
         if (pwrite(FD[cpu], &data, sizeof(data), reg) != sizeof(data))
         {
-            ERROR_PRINT("cpu %d reg %x",cpu, reg);
+            ERROR_PRINT(Cannot write MSR reg 0x%x with WRMSR instruction on CPU %d\n,
+                        reg, cpu);
         }
     }
     else
diff --git a/src/multiplex.c b/src/multiplex.c
index b3d927d..68a6b88 100644
--- a/src/multiplex.c
+++ b/src/multiplex.c
@@ -5,8 +5,8 @@
  *
  *      Description:  
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/numa.c b/src/numa.c
index c3a52cd..2f72765 100644
--- a/src/numa.c
+++ b/src/numa.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Implementation of Linux NUMA interface
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -69,18 +69,18 @@ static int maxIdConfiguredNode = 0;
 static void
 setConfiguredNodes(void)
 {
-	DIR *dir;
-	struct dirent *de;
+    DIR *dir;
+    struct dirent *de;
 
-	dir = opendir("/sys/devices/system/node");
+    dir = opendir("/sys/devices/system/node");
 
-	if (!dir) 
+    if (!dir) 
     {
-		maxIdConfiguredNode = 0;
-	}
+        maxIdConfiguredNode = 0;
+    }
     else
     {
-		while ((de = readdir(dir)) != NULL) 
+        while ((de = readdir(dir)) != NULL) 
         {
             int nd;
             if (strncmp(de->d_name, "node", 4))
@@ -95,25 +95,25 @@ setConfiguredNodes(void)
                 maxIdConfiguredNode = nd;
             }
         }
-		closedir(dir);
-	}
+        closedir(dir);
+    }
 }
 
 
 static void
 nodeMeminfo(int node, uint64_t* totalMemory, uint64_t* freeMemory)
 {
-	FILE *fp;
+    FILE *fp;
     bstring filename;
     bstring totalString = bformat("MemTotal:");
     bstring freeString  = bformat("MemFree:");
     int i;
 
-	filename = bformat("/sys/devices/system/node/node%d/meminfo", node);
+    filename = bformat("/sys/devices/system/node/node%d/meminfo", node);
 
-	if (NULL != (fp = fopen (bdata(filename), "r"))) 
-	{
-		bstring src = bread ((bNread) fread, fp);
+    if (NULL != (fp = fopen (bdata(filename), "r"))) 
+    {
+        bstring src = bread ((bNread) fread, fp);
         struct bstrList* tokens = bsplit(src,(char) '\n');
 
         for (i=0;i<tokens->qty;i++)
@@ -133,13 +133,13 @@ nodeMeminfo(int node, uint64_t* totalMemory, uint64_t* freeMemory)
                  *freeMemory = str2int(bdata(subtokens->entry[0]));
             }
         }
-	}
+    }
     else
     {
         ERROR;
     }
 
-	fclose(fp);
+    fclose(fp);
 }
 
 static int
@@ -211,7 +211,6 @@ nodeProcessorList(int node, uint32_t** list)
 
         /* FIXME: CPU list here is not physical cores first but numerical sorted */
 
-
         return count;
     }
 
@@ -303,6 +302,7 @@ numa_init()
     for (i=0; i<numa_info.numberOfNodes; i++)
     {
         nodeMeminfo(i, &numa_info.nodes[i].totalMemory, &numa_info.nodes[i].freeMemory);
+        numa_info.nodes[i].id = i;
         numa_info.nodes[i].numberOfProcessors = nodeProcessorList(i,&numa_info.nodes[i].processors);
         numa_info.nodes[i].numberOfDistances = nodeDistanceList(i, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
     }
diff --git a/src/pci.c b/src/pci.c
index 01d0887..2e8a22f 100644
--- a/src/pci.c
+++ b/src/pci.c
@@ -8,8 +8,8 @@
  *                   performance monitoring registers in PCI Cfg space
  *                   for Intel Sandy Bridge Processors.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -156,52 +156,51 @@ pci_init(int initSocket_fd)
         return;
     }
 
-	socket_count = cntr;
+    socket_count = cntr;
 
-	bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
-	bcatcstr(filepath, socket_bus[0]);
-	bcatcstr(filepath, pci_DevicePath[0] );
+    bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
+    bcatcstr(filepath, socket_bus[0]);
+    bcatcstr(filepath, pci_DevicePath[0] );
 
 
     if (access(bdata(filepath),F_OK))
     {
         fprintf(stderr, "INFO\n");
-        fprintf(stderr, "       This system has no support for PCI based Uncore counters.\n");
-        fprintf(stderr, "       This means you cannot use performance groups as MEM, which require Uncore counters.\n\n");
+        fprintf(stderr, "This system has no support for PCI based Uncore counters.\n");
+        fprintf(stderr, "This means you cannot use performance groups as MEM, which require Uncore counters.\n\n");
         return;
     }
     bdestroy(filepath);
 
-    if (accessClient_mode == DAEMON_AM_DIRECT)
+    for (int j=0; j<socket_count; j++)
     {
-        if(geteuid() != 0)
+        for (int i=0; i<MAX_NUM_DEVICES; i++)
         {
-            fprintf(stderr, "WARNING\n");
-            fprintf(stderr, "       Direct access to the PCI Cfg Adressspace is only allowed for uid root!\n");
-            fprintf(stderr, "       This means you can use performance groups as MEM only as root in direct mode.\n");
-            fprintf(stderr, "       Alternatively you might want to look into (sys)daemonmode.\n\n");
-        }
 
-        for (int j=0; j<socket_count; j++)
-        {
-            for (int i=0; i<MAX_NUM_DEVICES; i++)
-            {
+            bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
+            bcatcstr(filepath, socket_bus[j]);
+            bcatcstr(filepath, pci_DevicePath[i] );
 
-                bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
-				bcatcstr(filepath, socket_bus[j]);
-				bcatcstr(filepath, pci_DevicePath[i] );
-				
-				if (!access(bdata(filepath),R_OK|W_OK))
-				{
-					FD[j][i] = 0;
-				}
-				else
-				{
-					//fprintf(stderr, "Device %s not found, excluded it from device list\n",bdata(filepath));
-					FD[j][i] = -2;
-				}
-				bdestroy(filepath);
+            if (!access(bdata(filepath),F_OK))
+            {
+                FD[j][i] = 0;
             }
+            else
+            {
+                FD[j][i] = -2;
+            }
+            bdestroy(filepath);
+        }
+    }
+
+    if (accessClient_mode == DAEMON_AM_DIRECT)
+    {
+        if(geteuid() != 0)
+        {
+            fprintf(stderr, "WARNING\n");
+            fprintf(stderr, "Direct access to the PCI Cfg Adressspace is only allowed for uid root!\n");
+            fprintf(stderr, "This means you can use performance groups as MEM only as root in direct mode.\n");
+            fprintf(stderr, "Alternatively you might want to look into (sys)daemonmode.\n\n");
         }
     }
     else /* daemon or sysdaemon-mode */
@@ -214,20 +213,18 @@ pci_init(int initSocket_fd)
 void
 pci_finalize()
 {
-    if (accessClient_mode == DAEMON_AM_DIRECT)
+    for (int j=0; j<socket_count; j++)
     {
-        for (int j=0; j<socket_count; j++)
+        for (int i=0; i<MAX_NUM_DEVICES; i++)
         {
-            for (int i=0; i<MAX_NUM_DEVICES; i++)
+            if (FD[j][i] > 0)
             {
-                if (FD[j][i] > 0)
-                {
-                    close(FD[j][i]);
-                }
+                close(FD[j][i]);
             }
         }
     }
-    else
+
+    if (accessClient_mode != DAEMON_AM_DIRECT)
     {
         socket_fd = -1;
     }
@@ -238,16 +235,16 @@ uint32_t
 pci_read(int cpu, PciDeviceIndex device, uint32_t reg)
 {
     int socketId = affinity_core2node_lookup[cpu];
+    if ( FD[socketId][device] == -2)
+    {
+        fprintf(stderr, "Trying to access non-existent PCI device (%s) for reading\n", pci_DevicePath[device]);
+        return 0;
+    }
 
     if (accessClient_mode == DAEMON_AM_DIRECT)
     {
         uint32_t data = 0;
-		if ( FD[socketId][device] == -2)
-		{
-			fprintf(stderr, "Accessing non-existent device %s%s%s\n",PCI_ROOT_PATH,socket_bus[socketId],pci_DevicePath[device]);
-			return data;
-		}
-        else if ( !FD[socketId][device] )
+        if ( !FD[socketId][device] )
         {
             bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
             bcatcstr(filepath, socket_bus[socketId]);
@@ -256,17 +253,16 @@ pci_read(int cpu, PciDeviceIndex device, uint32_t reg)
 
             if ( FD[socketId][device] < 0)
             {
-                fprintf(stderr, "ERROR in pci_read:\n    failed to open pci device %s: %s!\n",
+                fprintf(stderr, "ERROR in pci_read: failed to open pci device %s: %s!\n",
                         bdata(filepath), strerror(errno));
-                // exit(127);
             }
             bdestroy(filepath);
         }
 
         if ( FD[socketId][device] > 0 &&
-             pread(FD[socketId][device], &data, sizeof data, reg) != sizeof data ) 
+             pread(FD[socketId][device], &data, sizeof data, reg) != sizeof data )
         {
-            ERROR_PRINT("cpu %d reg %x",cpu, reg);
+            ERROR_PRINT("ERROR in pci_read: failed on CPU %d Register 0x%x", cpu, reg);
         }
 
         return data;
@@ -284,14 +280,14 @@ pci_write(int cpu, PciDeviceIndex device, uint32_t reg, uint32_t data)
 {
     int socketId = affinity_core2node_lookup[cpu];
 
+    if ( FD[socketId][device] == -2)
+    {
+        fprintf(stderr, "Trying to access non-existent PCI device (%s) for writing\n", pci_DevicePath[device]);
+        return;
+    }
     if (accessClient_mode == DAEMON_AM_DIRECT)
     {
-		if ( FD[socketId][device] == -2)
-		{
-			fprintf(stderr, "Accessing non-existent device %s%s%s\n",PCI_ROOT_PATH,socket_bus[socketId],pci_DevicePath[device]);
-			return;
-		}
-        else if ( !FD[socketId][device] )
+        if ( !FD[socketId][device] )
         {
             bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
             bcatcstr(filepath, socket_bus[socketId]);
@@ -300,20 +296,17 @@ pci_write(int cpu, PciDeviceIndex device, uint32_t reg, uint32_t data)
 
             if ( FD[socketId][device] < 0)
             {
-                fprintf(stderr, "ERROR in pci_write:\n    failed to open pci device %s: %s!\n",
+                fprintf(stderr, "ERROR in pci_write: failed to open pci device %s: %s!\n",
                         bdata(filepath), strerror(errno));
-                // exit(127);
             }
             bdestroy(filepath);
         }
 
         if ( FD[socketId][device] > 0 &&
-             pwrite(FD[socketId][device], &data, sizeof data, reg) != sizeof data) 
+             pwrite(FD[socketId][device], &data, sizeof data, reg) != sizeof data)
         {
-            ERROR_PRINT("cpu %d reg %x",cpu, reg);
+            ERROR_PRINT("ERROR in pci_write: failed on CPU %d Register 0x%x", cpu, reg);
         }
-
-        //    printf("WRITE Device %s cpu %d reg 0x%x data 0x%x \n",bdata(filepath), cpu, reg, data);
     }
     else
     { /* daemon or sysdaemon-mode */
@@ -325,21 +318,19 @@ uint32_t
 pci_tread(const int tsocket_fd, const int cpu, PciDeviceIndex device, uint32_t reg)
 {
     int socketId = affinity_core2node_lookup[cpu];
+    if ( FD[socketId][device] == -2)
+    {
+        return 0;
+    }
 
     if (accessClient_mode == DAEMON_AM_DIRECT)
     {
         uint32_t data = 0;
-		if ( FD[socketId][device] == -2)
-		{
-			fprintf(stderr, "Accessing non-existent device %s%s%s\n",PCI_ROOT_PATH,socket_bus[socketId],pci_DevicePath[device]);
-			return data;
-		}
-        else if ( !FD[socketId][device] )
+        if ( !FD[socketId][device] )
         {
             bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
             bcatcstr(filepath, socket_bus[socketId]);
             bcatcstr(filepath, pci_DevicePath[device] );
-            //        printf("Generate PATH = %s \n",bdata(filepath));
 
             FD[socketId][device] = open( bdata(filepath), O_RDWR);
 
@@ -347,17 +338,15 @@ pci_tread(const int tsocket_fd, const int cpu, PciDeviceIndex device, uint32_t r
             {
                 fprintf(stderr, "ERROR in pci_tread:\n    failed to open pci device %s: %s!\n",
                         bdata(filepath), strerror(errno));
-                // exit(127);
             }
             bdestroy(filepath);
         }
 
         if ( FD[socketId][device] > 0 &&
-             pread(FD[socketId][device], &data, sizeof data, reg) != sizeof data ) 
+             pread(FD[socketId][device], &data, sizeof data, reg) != sizeof data )
         {
-            ERROR_PRINT("cpu %d reg %x",cpu, reg);
+            ERROR_PRINT("ERROR in pci_tread: failed on CPU %d Register 0x%x", cpu, reg);
         }
-        //    printf("READ Device %s cpu %d reg 0x%x data 0x%x \n",bdata(filepath), cpu, reg, data);
 
         return data;
     }
@@ -371,39 +360,33 @@ void
 pci_twrite( const int tsocket_fd, const int cpu, PciDeviceIndex device, uint32_t reg, uint32_t data)
 {
     int socketId = affinity_core2node_lookup[cpu];
-	
+    if ( FD[socketId][device] == -2)
+    {
+        return;
+    }
     if (accessClient_mode == DAEMON_AM_DIRECT)
     {
-		if ( FD[socketId][device] == -2)
-		{
-			fprintf(stderr, "Accessing non-existent device %s%s%s\n",PCI_ROOT_PATH,socket_bus[socketId],pci_DevicePath[device]);
-			return;
-		}
-        else if ( !FD[socketId][device] )
+        if ( !FD[socketId][device] )
         {
             bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
             bcatcstr(filepath, socket_bus[socketId]);
             bcatcstr(filepath, pci_DevicePath[device] );
-            //        printf("Generate PATH = %s \n",bdata(filepath));
 
             FD[socketId][device] = open( bdata(filepath), O_RDWR);
 
             if ( FD[socketId][device] < 0)
             {
-                fprintf(stderr, "ERROR in pci_twrite:\n    failed to open pci device %s: %s!\n",
+                fprintf(stderr, "ERROR in pci_twrite: failed to open pci device %s: %s!\n",
                         bdata(filepath), strerror(errno));
-                //exit(127);
             }
             bdestroy(filepath);
         }
 
         if ( FD[socketId][device] > 0 &&
-             pwrite(FD[socketId][device], &data, sizeof data, reg) != sizeof data) 
+             pwrite(FD[socketId][device], &data, sizeof data, reg) != sizeof data)
         {
-            ERROR_PRINT("cpu %d reg %x",cpu, reg);
+            ERROR_PRINT("ERROR in pci_twrite: failed on CPU %d Register 0x%x", cpu, reg);
         }
-
-        //    printf("WRITE Device %s cpu %d reg 0x%x data 0x%x \n",bdata(filepath), cpu, reg, data);
     }
     else
     { /* daemon or sysdaemon-mode */
diff --git a/src/perfmon.c b/src/perfmon.c
index 6d1630f..30cacba 100644
--- a/src/perfmon.c
+++ b/src/perfmon.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Implementation of perfmon Module.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -37,6 +37,7 @@
 #include <float.h>
 #include <unistd.h>
 #include <sys/types.h>
+#include <assert.h>
 
 #include <types.h>
 #include <bitUtil.h>
@@ -71,6 +72,7 @@ static PerfmonEvent* eventHash;
 static PerfmonCounterMap* counter_map;
 static PerfmonGroupMap* group_map;
 static PerfmonGroupHelp* group_help;
+static EventSetup * eventSetup;
 
 static TimerData timeData;
 static double rdtscTime;
@@ -123,7 +125,7 @@ static void initThread(int , int );
     for (i=0; i<numRows; i++) \
     { \
         fc->entry[1+i] = \
-           bfromcstr(perfmon_set.events[i].event.name); } 
+           bfromcstr(perfmon_set.events[i].event.name); }
 
 #define INIT_BASIC  \
     fc = bstrListCreate(); \
@@ -145,6 +147,7 @@ static void initThread(int , int );
 #include <perfmon_k10.h>
 #include <perfmon_interlagos.h>
 #include <perfmon_kabini.h>
+#include <perfmon_silvermont.h>
 
 /* #####  EXPORTED  FUNCTION POINTERS   ################################### */
 void (*perfmon_startCountersThread) (int thread_id);
@@ -154,6 +157,8 @@ void (*perfmon_setupCounterThread) (int thread_id,
         PerfmonEvent* event, PerfmonCounterIndex index);
 void (*printDerivedMetrics) (PerfmonGroup group);
 void (*logDerivedMetrics) (PerfmonGroup group, double time, double timeStamp);
+void (*perfmon_getDerivedCounterValuesArch)(PerfmonGroup group, float * values, float * out_max, float * out_min);
+
 
 /* #####   FUNCTION POINTERS  -  LOCAL TO THIS SOURCE FILE ################ */
 
@@ -163,18 +168,53 @@ static void (*initThreadArch) (PerfmonThread *thread);
 
+/*
+ * Look up the counter register name reg in counter_map.
+ *
+ * On a match *index receives the counter's index and TRUE is returned;
+ * otherwise *index is left untouched and FALSE is returned.
+ *
+ * Side effect on a match: registers of the counter are accessed on CPU 0.
+ * For counters that are neither THERMAL nor POWER the config register is
+ * read and then written with 0, through the MSR interface when the device
+ * field is 0 and through the PCI interface otherwise. For POWER counters
+ * the counter register is read. The values read are discarded.
+ * NOTE(review): presumably these accesses probe whether the register is
+ * reachable (msr_read/pci_read report failures themselves) - confirm.
+ */
 static int getIndex (bstring reg, PerfmonCounterIndex* index)
 {
+    int ret = FALSE;
+
     for (int i=0; i< perfmon_numCounters; i++)
     {
         if (biseqcstr(reg, counter_map[i].key))
         {
             *index = counter_map[i].index;
-            return TRUE;
+            ret = TRUE;
+            break;
+        }
+    }
+    if ((ret) && (counter_map[*index].type != THERMAL) && (counter_map[*index].type != POWER))
+    {
+        if (counter_map[*index].device == 0)
+        {
+            (void) msr_read(0, counter_map[*index].configRegister);
+            msr_write(0, counter_map[*index].configRegister,0x0ULL);
         }
+        else
+        {
+            (void) pci_read(0, counter_map[*index].device, counter_map[*index].configRegister);
+            pci_write(0, counter_map[*index].device, counter_map[*index].configRegister, 0x0U);
+        }
+    }
+    else if ((ret) && (counter_map[*index].type == POWER))
+    {
+        (void) msr_read(0, counter_map[*index].counterRegister);
     }
 
-    return FALSE;
+    return ret;
 }
 
+
 static int
 getEvent(bstring event_str, PerfmonEvent* event)
 {
@@ -214,9 +240,9 @@ initThread(int thread_id, int cpu_id)
 }
 
 struct cbsScan{
-	/* Parse state */
-	bstring src;
-	int line;
+    /* Parse state */
+    bstring src;
+    int line;
     LikwidResults* results;
 };
 
@@ -242,7 +268,8 @@ static int lineCb (void* parm, int ofs, int len)
             ERROR_PLAIN_PRINT(Failed to read marker file);
         }
         ret = sscanf (bdata(strList->entry[0]), "%d", &id); CHECKERROR;
-        st->results[id].tag = bstrcpy(strList->entry[1]);
+        st->results[id].tag = bstrcpy(line);
+	 bdelete(st->results[id].tag, 0, blength(strList->entry[0])+1);
     }
     else
     {
@@ -291,14 +318,15 @@ readMarkerFile(bstring filename, LikwidResults** resultsRef)
         ret = sscanf (bdata(src), "%d %d", &numberOfThreads, &perfmon_numRegions); CHECKERROR;
         results = (LikwidResults*) malloc(perfmon_numRegions * sizeof(LikwidResults));
 
-        if (numberOfThreads != perfmon_numThreads)
+        if (perfmon_numRegions == 0)
         {
-            fprintf(OUTSTREAM,"ERROR: \
-                Is the number of threads for likwid-perfctr equal \
-                to the number in the measured application?\n");
-
-            fprintf(OUTSTREAM,"likwid_markerInit and likwid_markerClose \
-                must be called in serial region.\n");
+            fprintf(OUTSTREAM,"ERROR: No region results are listed in marker file\n");
+            ERROR_PLAIN_PRINT(No region results in marker file);
+        }
+        else if (numberOfThreads != perfmon_numThreads)
+        {
+            fprintf(OUTSTREAM,"ERROR: Is the number of threads for likwid-perfctr equal to the number in the measured application?\n");
+            fprintf(OUTSTREAM,"likwid_markerInit and likwid_markerClose must be called in serial region.\n");
 
             ERROR_PRINT(Number of threads %d in marker file unequal to number of threads in likwid-perfCtr %d,numberOfThreads,perfmon_numThreads);
         }
@@ -352,22 +380,22 @@ readMarkerFile(bstring filename, LikwidResults** resultsRef)
 static void
 printResultTable(PerfmonResultTable * tableData)
 {
-    if (perfmon_csvoutput) 
+    if (perfmon_csvoutput)
     {
         int r, c;
-        for (c = 0; c < tableData->header->qty; c++) 
+        for (c = 0; c < tableData->header->qty; c++)
         {
             fprintf(OUTSTREAM, "%s%s", ((c == 0) ? "\n" : ","), tableData->header->entry[c]->data);
         }
         fprintf(OUTSTREAM, "%s", "\n");
 
-        for (r = 0; r < tableData->numRows; r++) 
+        for (r = 0; r < tableData->numRows; r++)
         {
             fprintf(OUTSTREAM, "%s", tableData->rows[r].label->data);
 
-            for (c = 0; c < tableData->numColumns; c++) 
+            for (c = 0; c < tableData->numColumns; c++)
             {
-                if (!isnan(tableData->rows[r].value[c])) 
+                if (!isnan(tableData->rows[r].value[c]))
                 {
                     fprintf(OUTSTREAM, ",%lf", tableData->rows[r].value[c]);
                 }
@@ -424,7 +452,7 @@ getGroupId(bstring groupStr,PerfmonGroup* group)
 
     for (int i=0; i<perfmon_numGroups; i++)
     {
-        if (biseqcstr(groupStr,group_map[i].key)) 
+        if (biseqcstr(groupStr,group_map[i].key))
         {
             *group = group_map[i].index;
             return i;
@@ -478,7 +506,7 @@ freeResultTable(PerfmonResultTable* tableData)
     free(tableData->rows);
 }
 
-static void 
+static void
 initResultTable(PerfmonResultTable* tableData,
         bstrList* firstColumn,
         int numRows,
@@ -505,17 +533,13 @@ initResultTable(PerfmonResultTable* tableData,
 
     for (i=0; i<numRows; i++)
     {
-//        tableData->rows[i].label =
-//           bfromcstr(perfmon_set.events[i].event.name);
-
         tableData->rows[i].label = firstColumn->entry[1+i];
-
         tableData->rows[i].value =
             (double*) malloc((numColumns)*sizeof(double));
     }
 }
 
-static void 
+static void
 initStatisticTable(PerfmonResultTable* tableData,
         bstrList* firstColumn,
         int numRows)
@@ -545,12 +569,8 @@ initStatisticTable(PerfmonResultTable* tableData,
 
     for (i=0; i<numRows; i++)
     {
-//        tableData->rows[i].label =
-//           bfromcstr(perfmon_set.events[i].event.name);
-
         tableData->rows[i].label = firstColumn->entry[1+i];
         bcatcstr(tableData->rows[i].label," STAT");
-
         tableData->rows[i].value =
             (double*) malloc((numColumns)*sizeof(double));
     }
@@ -802,7 +822,7 @@ perfmon_printMarkerResults(bstring filepath)
     bstrListDestroy(regionLabels);
 }
 
-void 
+void
 perfmon_logCounterResults(double time)
 {
     int i;
@@ -818,11 +838,8 @@ perfmon_logCounterResults(double time)
         for (j=0; j<perfmon_numThreads; j++)
         {
             fprintf(OUTSTREAM, "%e ",
-                    (double) (perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData) - perfmon_threadState[j][i]);
-            tmp =perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
-            perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData -=
-              perfmon_threadState[j][i];
-            perfmon_threadState[j][i] = tmp;
+                    (double) (perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData) - perfmon_threadState[j][perfmon_set.events[i].index]);
+            perfmon_threadState[j][perfmon_set.events[i].index] = perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
         }
         fprintf(OUTSTREAM,"\n");
     }
@@ -835,7 +852,7 @@ perfmon_logCounterResults(double time)
     fflush(OUTSTREAM);
 }
 
-void 
+void
 perfmon_printCounterResults()
 {
     int i;
@@ -911,6 +928,144 @@ perfmon_getEventResult(int thread, int index)
     return (double) perfmon_threadData[thread].counters[perfmon_set.events[index].index].counterData;
 }
 
+/*
+ * Build an EventSetup for a performance group name or a custom event string.
+ *
+ * eventGroupString is first looked up with getGroupId(). If that yields a
+ * group, the group's config string is parsed into the event set; otherwise
+ * eventGroupString itself is parsed as a custom event set, which gets no
+ * derived counters (numberOfDerivedCounters = 0, derivedNames = NULL).
+ *
+ * The returned struct holds heap allocations (eventSetConfig, perfmon_set,
+ * groupName, eventNames, derivedNames) that this function does not free.
+ * The entries of eventNames and derivedNames are not copies; they point at
+ * the names stored in perfmon_set and group_map.
+ */
+EventSetup perfmon_prepareEventSetup(char* eventGroupString){
+    EventSetup setup;
+    bstring eventString = bfromcstr(eventGroupString);
+
+    /* Size the allocations by the pointed-to objects, not by the pointers */
+    setup.eventSetConfig = malloc(sizeof(*setup.eventSetConfig));
+    setup.perfmon_set = malloc(sizeof(*setup.perfmon_set));
+    if ((setup.eventSetConfig == NULL) || (setup.perfmon_set == NULL))
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for event setup);
+        exit(EXIT_FAILURE);
+    }
+
+    int groupId = getGroupId(eventString, & setup.groupSet);
+    setup.groupName = strdup(eventGroupString);
+    setup.groupIndex = groupId;
+    if (setup.groupSet != _NOGROUP)
+    {
+        /* eventString is a group: release the group name string before
+         * replacing it with the group's event list */
+        bdestroy(eventString);
+        eventString = bfromcstr(group_map[groupId].config);
+    }
+    /* else: eventString is a custom eventSet and is parsed as given */
+    bstr_to_eventset(setup.eventSetConfig, eventString);
+
+    perfmon_initEventSet(setup.eventSetConfig, setup.perfmon_set);
+    bdestroy(eventString);
+
+    setup.eventNames = (const char**) malloc(setup.perfmon_set->numberOfEvents * sizeof(const char*));
+
+    setup.numberOfEvents = setup.perfmon_set->numberOfEvents;
+    for (int i=0; i< setup.perfmon_set->numberOfEvents; i++)
+    {
+        setup.eventNames[i] = setup.perfmon_set->events[i].event.name;
+    }
+
+    /* group_map must only be indexed with the id of a group that matched */
+    setup.numberOfDerivedCounters = 0;
+    setup.derivedNames = NULL;
+    if (setup.groupSet != _NOGROUP)
+    {
+        setup.numberOfDerivedCounters = group_map[groupId].derivedCounters;
+        setup.derivedNames = (const char**) malloc(setup.numberOfDerivedCounters * sizeof(const char*));
+
+        for(int i=0; i < group_map[groupId].derivedCounters; i++){
+            setup.derivedNames[i] = group_map[groupId].derivedCounterNames[i];
+        }
+    }
+
+    return setup;
+}
+
+
+/*
+ * Make the event set described by setup the active one: copy
+ * *setup->perfmon_set and setup->groupSet into the file-wide perfmon_set and
+ * groupSet, then call perfmon_setupCounters().
+ *
+ * The setup pointer itself is kept and dereferenced later by
+ * perfmon_getDerivedCounterValues(), so *setup must outlive this call.
+ */
+void perfmon_setupCountersForEventSet(EventSetup * setup){
+    perfmon_set = *setup->perfmon_set;
+    groupSet = setup->groupSet;
+    eventSetup = setup;
+    perfmon_setupCounters();
+}
+
+/*
+ * Reduce the raw counter value of every event in perfmon_set over all threads.
+ *
+ * values[e]   integer average (sum over threads / perfmon_numThreads)
+ * out_max[e]  largest per-thread value
+ * out_min[e]  smallest per-thread value
+ *
+ * Each array needs room for perfmon_set.numberOfEvents entries. If no
+ * threads are configured, all three outputs are set to 0.
+ */
+void perfmon_getEventCounterValues(uint64_t * values, uint64_t * out_max, uint64_t * out_min){
+
+    for(int e = 0; e < perfmon_set.numberOfEvents; e++ ){
+        uint64_t sum = 0;
+        uint64_t min = (uint64_t) -1;
+        uint64_t max = 0;
+
+        if (perfmon_numThreads <= 0)
+        {
+            /* nothing to reduce, and the average below would divide by 0 */
+            values[e] = 0;
+            out_min[e] = 0;
+            out_max[e] = 0;
+            continue;
+        }
+
+        for(int i = 0; i < perfmon_numThreads; i++){
+            /* counters[] is indexed by the counter the event runs on, as in
+             * perfmon_getEventResult(), not by the event's position e */
+            uint64_t cur = perfmon_threadData[i].counters[perfmon_set.events[e].index].counterData;
+            sum += cur;
+            max = max > cur ? max : cur;
+            min = min < cur ? min : cur;
+        }
+        values[e] = sum / perfmon_numThreads;
+        out_min[e] = min;
+        out_max[e] = max;
+    }
+}
+
+/*
+ * Forward to the architecture specific perfmon_getDerivedCounterValuesArch
+ * implementation for the group of the setup registered through
+ * perfmon_setupCountersForEventSet(); the three arrays are passed through
+ * unchanged. Without a registered setup or an implementation for the
+ * current architecture a message is printed and the arrays are left
+ * untouched.
+ */
+void perfmon_getDerivedCounterValues(float * values, float * out_max, float * out_min){
+    if ((eventSetup == NULL) || (perfmon_getDerivedCounterValuesArch == NULL))
+    {
+        fprintf(stderr, "Derived counter values are not available: no event setup or no architecture support\n");
+        return;
+    }
+    perfmon_getDerivedCounterValuesArch(eventSetup->groupSet, values, out_max, out_min);
+}
 
 int
 perfmon_setupEventSetC(char* eventCString, const char*** eventnames)
@@ -955,10 +1045,9 @@ perfmon_setupEventSet(bstring eventString, BitMask* counterMask)
     StrUtilEventSet eventSetConfig;
     PerfmonEvent eventSet;
     struct bstrList* subStr;
-    
 
     groupId = getGroupId(eventString, &groupSet);
-    
+
     if (groupSet == _NOGROUP)
     {
         subStr = bstrListCreate();
@@ -966,15 +1055,25 @@ perfmon_setupEventSet(bstring eventString, BitMask* counterMask)
         eventBool = getEvent(subStr->entry[0], &eventSet);
         bstrListDestroy(subStr);
     }
-    
+
     if (groupSet == _NOGROUP && eventBool != FALSE)
     {
         /* eventString is a custom eventSet */
         /* append fixed counters for Intel processors */
         if ( cpuid_info.family == P6_FAMILY )
         {
-            bcatcstr(eventString,
-                    ",INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,CPU_CLK_UNHALTED_REF:FIXC2");
+            if (cpuid_info.perf_num_fixed_ctr > 0)
+            {
+                bcatcstr(eventString,",INSTR_RETIRED_ANY:FIXC0");
+            }
+            if (cpuid_info.perf_num_fixed_ctr > 1)
+            {
+                bcatcstr(eventString,",CPU_CLK_UNHALTED_CORE:FIXC1");
+            }
+            if (cpuid_info.perf_num_fixed_ctr > 2)
+            {
+                bcatcstr(eventString,",CPU_CLK_UNHALTED_REF:FIXC2");
+            }
         }
         bstr_to_eventset(&eventSetConfig, eventString);
     }
@@ -988,7 +1087,9 @@ perfmon_setupEventSet(bstring eventString, BitMask* counterMask)
         if ( group_map[groupId].isUncore )
         {
             if ( (cpuid_info.model != SANDYBRIDGE_EP) &&
-                    (cpuid_info.model != IVYBRIDGE_EP))
+                    (cpuid_info.model != IVYBRIDGE_EP) &&
+                    (cpuid_info.model != WESTMERE_EX) &&
+                    (cpuid_info.model != NEHALEM_EX))
             {
                 ERROR_PLAIN_PRINT(Uncore not supported on Desktop processors!);
                 exit(EXIT_FAILURE);
@@ -1083,7 +1184,9 @@ perfmon_printAvailableGroups()
         if ( group_map[i].isUncore )
         {
             if ( (cpuid_info.model == SANDYBRIDGE_EP) ||
-                    (cpuid_info.model == IVYBRIDGE_EP))
+                 (cpuid_info.model == IVYBRIDGE_EP) ||
+                 (cpuid_info.model == WESTMERE_EX) ||
+                 (cpuid_info.model == NEHALEM_EX))
             {
                 fprintf(OUTSTREAM,"%s: %s\n",group_map[i].key,
                         group_map[i].info);
@@ -1136,12 +1239,16 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
     {
         perfmon_threadState[i] = (double*)
             malloc(NUM_PMC * sizeof(double));
+        for(int j=0; j<NUM_PMC;j++)
+        {
+            perfmon_threadState[i][j] = 0.0;
+        }
     }
 
-    for(int i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
-
     OUTSTREAM = outstream;
 
+    for(int i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
+
     if (accessClient_mode != DAEMON_AM_DIRECT)
     {
         accessClient_init(&socket_fd);
@@ -1171,6 +1278,7 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     initThreadArch = perfmon_init_pm;
                     printDerivedMetrics = perfmon_printDerivedMetrics_pm;
+                    assert(FALSE && "NOT SUPPORTED");
                     perfmon_startCountersThread = perfmon_startCountersThread_pm;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_pm;
                     perfmon_setupCounterThread = perfmon_setupCounterThread_pm;
@@ -1196,11 +1304,35 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     initThreadArch = perfmon_init_core2;
                     printDerivedMetrics = perfmon_printDerivedMetricsAtom;
+                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesAtom;
                     perfmon_startCountersThread = perfmon_startCountersThread_core2;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
                     perfmon_setupCounterThread = perfmon_setupCounterThread_core2;
                     break;
 
+                case ATOM_SILVERMONT_C:
+                case ATOM_SILVERMONT_E:
+                case ATOM_SILVERMONT_F1:
+                case ATOM_SILVERMONT_F2:
+                case ATOM_SILVERMONT_F3:
+                    power_init(0);
+                    thermal_init(0);
+                    eventHash = silvermont_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsSilvermont;
+
+                    group_map = silvermont_group_map;
+                    group_help = silvermont_group_help;
+                    perfmon_numGroups = perfmon_numGroupsSilvermont;
+
+                    counter_map = silvermont_counter_map;
+                    perfmon_numCounters = perfmon_numCountersSilvermont;
+
+                    initThreadArch = perfmon_init_silvermont;
+                    printDerivedMetrics = perfmon_printDerivedMetricsSilvermont;
+                    perfmon_startCountersThread = perfmon_startCountersThread_silvermont;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_silvermont;
+                    perfmon_setupCounterThread = perfmon_setupCounterThread_silvermont;
+                    break;
 
                 case CORE_DUO:
                     ERROR_PLAIN_PRINT(Unsupported Processor);
@@ -1224,6 +1356,8 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     initThreadArch = perfmon_init_core2;
                     printDerivedMetrics = perfmon_printDerivedMetricsCore2;
+                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesCore2;
+
                     logDerivedMetrics = perfmon_logDerivedMetricsCore2;
                     perfmon_startCountersThread = perfmon_startCountersThread_core2;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
@@ -1243,12 +1377,13 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
                     counter_map = westmereEX_counter_map;
                     perfmon_numCounters = perfmon_numCountersWestmereEX;
 
-                    initThreadArch = perfmon_init_westmereEX;
+                    initThreadArch = perfmon_init_nehalemEX;
                     printDerivedMetrics = perfmon_printDerivedMetricsNehalemEX;
+                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesNehalemEX;
                     logDerivedMetrics = perfmon_logDerivedMetricsNehalemEX;
-                    perfmon_startCountersThread = perfmon_startCountersThread_westmereEX;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_westmereEX;
-                    perfmon_readCountersThread = perfmon_readCountersThread_westmereEX;
+                    perfmon_startCountersThread = perfmon_startCountersThread_nehalemEX;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalemEX;
+                    perfmon_readCountersThread = perfmon_readCountersThread_nehalemEX;
                     perfmon_setupCounterThread = perfmon_setupCounterThread_nehalemEX;
                     break;
 
@@ -1266,6 +1401,7 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     initThreadArch = perfmon_init_westmereEX;
                     printDerivedMetrics = perfmon_printDerivedMetricsWestmereEX;
+                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesWestmereEX;
                     logDerivedMetrics = perfmon_logDerivedMetricsWestmereEX;
                     perfmon_startCountersThread = perfmon_startCountersThread_westmereEX;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_westmereEX;
@@ -1291,6 +1427,8 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     initThreadArch = perfmon_init_nehalem;
                     printDerivedMetrics = perfmon_printDerivedMetricsNehalem;
+                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesNehalem;
+
                     logDerivedMetrics = perfmon_logDerivedMetricsNehalem;
                     perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
@@ -1316,6 +1454,8 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     initThreadArch = perfmon_init_nehalem;
                     printDerivedMetrics = perfmon_printDerivedMetricsWestmere;
+                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesWestmere;
+
                     logDerivedMetrics = perfmon_logDerivedMetricsWestmere;
                     perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
@@ -1329,7 +1469,7 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     power_init(0); /* FIXME Static coreId is dangerous */
                     thermal_init(0);
-                    pci_init(socket_fd); 
+                    pci_init(socket_fd);
 
                     eventHash = ivybridge_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsIvybridge;
@@ -1343,6 +1483,8 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     initThreadArch = perfmon_init_ivybridge;
                     printDerivedMetrics = perfmon_printDerivedMetricsIvybridge;
+                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesIvybridge;
+
                     logDerivedMetrics = perfmon_logDerivedMetricsIvybridge;
                     perfmon_startCountersThread = perfmon_startCountersThread_ivybridge;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_ivybridge;
@@ -1373,6 +1515,7 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     initThreadArch = perfmon_init_haswell;
                     printDerivedMetrics = perfmon_printDerivedMetricsHaswell;
+                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesHaswell;
                     logDerivedMetrics = perfmon_logDerivedMetricsHaswell;
                     perfmon_startCountersThread = perfmon_startCountersThread_haswell;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_haswell;
@@ -1400,6 +1543,7 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     initThreadArch = perfmon_init_sandybridge;
                     printDerivedMetrics = perfmon_printDerivedMetricsSandybridge;
+                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesSandybridge;
                     logDerivedMetrics = perfmon_logDerivedMetricsSandybridge;
                     perfmon_startCountersThread = perfmon_startCountersThread_sandybridge;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_sandybridge;
@@ -1431,6 +1575,7 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
                     initThreadArch = perfmon_init_phi;
                     printDerivedMetrics = perfmon_printDerivedMetricsPhi;
+                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesPhi;
                     logDerivedMetrics = perfmon_logDerivedMetricsPhi;
                     perfmon_startCountersThread = perfmon_startCountersThread_phi;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_phi;
@@ -1457,6 +1602,7 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
             initThreadArch = perfmon_init_k10;
             printDerivedMetrics = perfmon_printDerivedMetricsK8;
+            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesK8;
             logDerivedMetrics = perfmon_logDerivedMetricsK8;
             perfmon_startCountersThread = perfmon_startCountersThread_k10;
             perfmon_stopCountersThread = perfmon_stopCountersThread_k10;
@@ -1477,6 +1623,7 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
             initThreadArch = perfmon_init_k10;
             printDerivedMetrics = perfmon_printDerivedMetricsK10;
+            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesK10;
             logDerivedMetrics = perfmon_logDerivedMetricsK10;
             perfmon_startCountersThread = perfmon_startCountersThread_k10;
             perfmon_stopCountersThread = perfmon_stopCountersThread_k10;
@@ -1497,6 +1644,7 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
             initThreadArch = perfmon_init_interlagos;
             printDerivedMetrics = perfmon_printDerivedMetricsInterlagos;
+            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesInterlagos;
             logDerivedMetrics = perfmon_logDerivedMetricsInterlagos;
             perfmon_startCountersThread = perfmon_startCountersThread_interlagos;
             perfmon_stopCountersThread = perfmon_stopCountersThread_interlagos;
@@ -1517,6 +1665,7 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
 
             initThreadArch = perfmon_init_kabini;
             printDerivedMetrics = perfmon_printDerivedMetricsKabini;
+            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesKabini;
             logDerivedMetrics = perfmon_logDerivedMetricsKabini;
             perfmon_startCountersThread = perfmon_startCountersThread_kabini;
             perfmon_stopCountersThread = perfmon_stopCountersThread_kabini;
diff --git a/src/power.c b/src/power.c
index 8f55cb2..3f4118c 100644
--- a/src/power.c
+++ b/src/power.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Module implementing Intel RAPL interface
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -54,6 +54,7 @@ power_init(int cpuId)
 {
     uint64_t flags;
     int hasRAPL = 0;
+    uint32_t info_register = 0x0;
 
     /* determine Turbo Mode features */
     double busSpeed;
@@ -61,10 +62,24 @@ power_init(int cpuId)
     if ((cpuid_info.model == SANDYBRIDGE_EP) ||
             (cpuid_info.model == SANDYBRIDGE) ||
             (cpuid_info.model == HASWELL) ||
+            (cpuid_info.model == HASWELL_EX) ||
             (cpuid_info.model == IVYBRIDGE_EP) ||
             (cpuid_info.model == IVYBRIDGE))
     {
         hasRAPL = 1;
+        info_register = MSR_PKG_POWER_INFO;
+    }
+    else if (cpuid_info.model == ATOM_SILVERMONT_C)
+    {
+        hasRAPL = 1;
+        info_register = MSR_PKG_POWER_INFO_SILVERMONT;
+    }
+    else if ((cpuid_info.model == ATOM_SILVERMONT_E) ||
+             (cpuid_info.model == ATOM_SILVERMONT_F1) ||
+             (cpuid_info.model == ATOM_SILVERMONT_F2) ||
+             (cpuid_info.model == ATOM_SILVERMONT_F3))
+    {
+        hasRAPL = 1;
     }
 
     if (cpuid_info.turbo)
@@ -114,15 +129,40 @@ power_init(int cpuId)
         power_info.energyUnit = pow(0.5,(double) extractBitField(flags,5,8));
         power_info.timeUnit = pow(0.5,(double) extractBitField(flags,4,16));
 
-        flags = msr_read(cpuId, MSR_PKG_POWER_INFO);
-        power_info.tdp = (double) extractBitField(flags,15,0) * power_info.powerUnit;
-        power_info.minPower =  (double) extractBitField(flags,15,16) * power_info.powerUnit;
-        power_info.maxPower = (double) extractBitField(flags,15,32) * power_info.powerUnit;
-        power_info.maxTimeWindow = (double) extractBitField(flags,7,48) * power_info.timeUnit;
+        if (info_register != 0x0)
+        {
+            flags = msr_read(cpuId, info_register);
+            power_info.tdp = (double) extractBitField(flags,15,0) * power_info.powerUnit;
+            if (cpuid_info.model != ATOM_SILVERMONT_C)
+            {
+                power_info.minPower =  (double) extractBitField(flags,15,16) * power_info.powerUnit;
+                power_info.maxPower = (double) extractBitField(flags,15,32) * power_info.powerUnit;
+                power_info.maxTimeWindow = (double) extractBitField(flags,7,48) * power_info.timeUnit;
+            }
+            else
+            {
+                power_info.minPower = 0.0;
+                power_info.maxPower = 0.0;
+                power_info.maxTimeWindow = 0.0;
+            }
+        }
+        else
+        {
+            power_info.tdp = 0;
+            power_info.minPower = 0.0;
+            power_info.maxPower = 0.0;
+            power_info.maxTimeWindow = 0.0;
+        }
     }
     else
     {
         power_info.powerUnit = 0.0;
+        power_info.energyUnit = 0.0;
+        power_info.timeUnit = 0.0;
+        power_info.tdp = 0;
+        power_info.minPower = 0.0;
+        power_info.maxPower = 0.0;
+        power_info.maxTimeWindow = 0.0;
     }
 }
 
diff --git a/src/pthread-overload/Makefile b/src/pthread-overload/Makefile
index bb61f96..5f460a5 100644
--- a/src/pthread-overload/Makefile
+++ b/src/pthread-overload/Makefile
@@ -4,8 +4,8 @@
 # 
 #      Description:  pthread-overload Makefile
 # 
-#      Version:   3.1.2
-#      Released:  2.6.2014
+#      Version:   3.1.3
+#      Released:  4.11.2014
 # 
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/pthread-overload/pthread-overload.c b/src/pthread-overload/pthread-overload.c
index c53c884..e9d5dcc 100644
--- a/src/pthread-overload/pthread-overload.c
+++ b/src/pthread-overload/pthread-overload.c
@@ -3,11 +3,11 @@
  *
  *      Filename:  pthread-overload.c
  *
- *      Description:  Overloaded library for pthread_create call. 
+ *      Description:  Overloaded library for pthread_create call.
  *                    Implements pinning of threads together with likwid-pin.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -171,7 +171,7 @@ pthread_create(pthread_t* thread,
         {
             break;
         }
-        if (sosearchpaths[reallpthrindex] != NULL) 
+        if (sosearchpaths[reallpthrindex] != NULL)
         {
             reallpthrindex++;
         }
diff --git a/src/strUtil.c b/src/strUtil.c
index 91a7083..cf37920 100644
--- a/src/strUtil.c
+++ b/src/strUtil.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Utility routines for strings. Depends on bstring lib.
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -46,161 +46,160 @@
 static int
 cpu_count(cpu_set_t* set)
 {
-  uint32_t i;
-  int s = 0;
-  const __cpu_mask *p = set->__bits;
-  const __cpu_mask *end = &set->__bits[sizeof(cpu_set_t) / sizeof (__cpu_mask)];
+    uint32_t i;
+    int s = 0;
+    const __cpu_mask *p = set->__bits;
+    const __cpu_mask *end = &set->__bits[sizeof(cpu_set_t) / sizeof (__cpu_mask)];
 
-  while (p < end)
-  {
-    __cpu_mask l = *p++;
-
-    if (l == 0)
+    while (p < end)
     {
-      continue;
-    }
+        __cpu_mask l = *p++;
 
-    for (i=0; i< (sizeof(__cpu_mask)*8); i++)
-    {
-      if (l&(1UL<<i))
-      {
-        s++;
-      }
+        if (l == 0)
+        {
+            continue;
+        }
+
+        for (i=0; i< (sizeof(__cpu_mask)*8); i++)
+        {
+            if (l&(1UL<<i))
+            {
+            s++;
+            }
+        }
     }
-  }
 
-  return s;
+    return s;
 }
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 int str2int(const char* str)
 {
-  char* endptr;
-  errno = 0;
-  unsigned long val;
-  val = strtoul(str, &endptr, 10);
-  if ((errno == ERANGE && val == LONG_MAX )
-      || (errno != 0 && val == 0))
-  {
-    ERROR;
-  }
-
-  if (endptr == str)
-  {
-    ERROR_PRINT(Cannot parse string %s to digits, str);
-  }
-
-  return (int) val;
+    char* endptr;
+    errno = 0;
+    unsigned long val;
+    val = strtoul(str, &endptr, 10);
+    if ((errno == ERANGE && val == LONG_MAX )
+        || (errno != 0 && val == 0))
+    {
+        ERROR;
+    }
+
+    if (endptr == str)
+    {
+        ERROR_PRINT(Cannot parse string %s to digits, str);
+    }
+
+    return (int) val;
 }
 
 uint32_t
 bstr_to_cpuset_physical(uint32_t* threads,  const_bstring q)
 {
-  int i;
-  unsigned int rangeBegin;
-  unsigned int rangeEnd;
-  uint32_t numThreads=0;
-  struct bstrList* tokens;
-  struct bstrList* subtokens;
-
-  tokens = bsplit(q,',');
+    int i;
+    unsigned int rangeBegin;
+    unsigned int rangeEnd;
+    uint32_t numThreads=0;
+    struct bstrList* tokens;
+    struct bstrList* subtokens;
 
-  for (i=0;i<tokens->qty;i++)
-  {
-    subtokens = bsplit(tokens->entry[i],'-');
+    tokens = bsplit(q,',');
 
-    if (numThreads > MAX_NUM_THREADS)
+    for (i=0;i<tokens->qty;i++)
     {
-      ERROR_PRINT(Number Of threads %d too large, numThreads);
-    }
+        subtokens = bsplit(tokens->entry[i],'-');
 
-    if( subtokens->qty == 1 )
-    {
-      threads[numThreads] = str2int((char *) bdata(subtokens->entry[0]));
-      numThreads++;
-    }
-    else if ( subtokens->qty == 2 )
-    {
-      rangeBegin = str2int((char*) bdata(subtokens->entry[0]));
-      rangeEnd = str2int((char*) bdata(subtokens->entry[1]));
+        if( subtokens->qty == 1 )
+        {
+            threads[numThreads] = str2int((char *) bdata(subtokens->entry[0]));
+            numThreads++;
+        }
+        else if ( subtokens->qty == 2 )
+        {
+            rangeBegin = str2int((char*) bdata(subtokens->entry[0]));
+            rangeEnd = str2int((char*) bdata(subtokens->entry[1]));
 
-      if (!(rangeBegin <= rangeEnd))
-      {
-        ERROR_PRINT(Range End %d bigger than begin %d,rangeEnd,rangeBegin);
-      }
+            if (!(rangeBegin <= rangeEnd))
+            {
+                ERROR_PRINT(Range End %d bigger than begin %d, rangeEnd, rangeBegin);
+            }
 
-      while (rangeBegin <= rangeEnd) {
-        threads[numThreads] = rangeBegin;
-        numThreads++;
-        rangeBegin++;
-      }
+            while (rangeBegin <= rangeEnd) {
+                threads[numThreads] = rangeBegin;
+                numThreads++;
+                rangeBegin++;
+            }
+        }
+        else
+        {
+            ERROR_PLAIN_PRINT(Parse Error);
+        }
+        bstrListDestroy(subtokens);
     }
-    else
+    if (numThreads > MAX_NUM_THREADS)
     {
-      ERROR_PLAIN_PRINT(Parse Error);
+        ERROR_PRINT(Number Of threads %d too large, numThreads);
     }
-    bstrListDestroy(subtokens);
-  }
 
-  bstrListDestroy(tokens);
+    bstrListDestroy(tokens);
 
-  return numThreads;
+    return numThreads;
 }
 
 uint32_t
 bstr_to_cpuset_logical(uint32_t* threads,  const_bstring q)
 {
-  int i;
-  uint32_t j;
-  int id;
-  uint32_t tmpThreads[MAX_NUM_THREADS];
-  int globalNumThreads=0;
-  uint32_t numThreads=0;
-  struct bstrList* tokens;
-  struct bstrList* subtokens;
-  const AffinityDomain* domain;
-
-  tokens = bsplit(q,'@');
-
-  for (i=0;i<tokens->qty;i++)
-  {
-    subtokens = bsplit(tokens->entry[i],':');
+    int i;
+    uint32_t j;
+    int id;
+    uint32_t tmpThreads[MAX_NUM_THREADS];
+    int globalNumThreads=0;
+    uint32_t numThreads=0;
+    struct bstrList* tokens;
+    struct bstrList* subtokens;
+    const AffinityDomain* domain;
 
-    if ( subtokens->qty == 2 )
+    tokens = bsplit(q,'@');
+
+    for (i=0;i<tokens->qty;i++)
     {
-      domain =  affinity_getDomain(subtokens->entry[0]);
+        subtokens = bsplit(tokens->entry[i],':');
 
-      if (!domain)
-      {
-        ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
-      }
+        if ( subtokens->qty == 2 )
+        {
+            domain =  affinity_getDomain(subtokens->entry[0]);
 
-      numThreads = bstr_to_cpuset_physical(tmpThreads, subtokens->entry[1]);
+            if (!domain)
+            {
+                ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
+            }
 
-      for (j=0; j<numThreads; j++)
-      {
-        if (! (tmpThreads[j] >= domain->numberOfProcessors))
-        {
-          id = (tmpThreads[j]/domain->numberOfCores) +
-            (tmpThreads[j]%domain->numberOfCores) * cpuid_topology.numThreadsPerCore;
-          threads[globalNumThreads++] = domain->processorList[id];
+            numThreads = bstr_to_cpuset_physical(tmpThreads, subtokens->entry[1]);
+
+            for (j=0; j<numThreads; j++)
+                {
+                if (! (tmpThreads[j] >= domain->numberOfProcessors))
+                {
+                    id = (tmpThreads[j]/domain->numberOfCores) +
+                        (tmpThreads[j]%domain->numberOfCores) * cpuid_topology.numThreadsPerCore;
+                    threads[globalNumThreads++] = domain->processorList[id];
+                }
+                else
+                {
+                    ERROR_PRINT(Too many threads requested. Avaialable 0-%d,domain->numberOfProcessors-1);
+                }
+            }
         }
         else
         {
-          ERROR_PRINT(Too many threads requested. Avaialable 0-%d,domain->numberOfProcessors-1);
+            ERROR_PLAIN_PRINT(Parse Error);
         }
-      }
+        bstrListDestroy(subtokens);
     }
-    else
-    {
-      ERROR_PLAIN_PRINT(Parse Error);
-    }
-    bstrListDestroy(subtokens);
-  }
 
-  bstrListDestroy(tokens);
+    bstrListDestroy(tokens);
 
-  return globalNumThreads;
+    return globalNumThreads;
 }
 
 #define PRINT_EXPR_ERR printf("SYNTAX ERROR: Expression must have the format E:<thread domain>:<num threads>[:chunk size>:<stride>]\n")
@@ -208,94 +207,92 @@ bstr_to_cpuset_logical(uint32_t* threads,  const_bstring q)
 uint32_t
 bstr_to_cpuset_expression(uint32_t* threads,  const_bstring qi)
 {
-  int i;
-  uint32_t j;
-  bstring q = (bstring) qi;
-  int globalNumThreads=0;
-  uint32_t numThreads=0;
-  struct bstrList* tokens;
-  struct bstrList* subtokens;
-  const AffinityDomain* domain;
-
-  bdelete (q, 0, 2);
-  tokens = bsplit(q,'@');
-
-  for (i=0;i<tokens->qty;i++)
-  {
-    subtokens = bsplit(tokens->entry[i],':');
+    int i;
+    uint32_t j;
+    bstring q = (bstring) qi;
+    int globalNumThreads=0;
+    uint32_t numThreads=0;
+    struct bstrList* tokens;
+    struct bstrList* subtokens;
+    const AffinityDomain* domain;
 
-    if ( subtokens->qty == 2 )
+    bdelete (q, 0, 2);
+    tokens = bsplit(q,'@');
+
+    for (i=0;i<tokens->qty;i++)
     {
-      domain =  affinity_getDomain(subtokens->entry[0]);
+        subtokens = bsplit(tokens->entry[i],':');
 
-      if (!domain)
-      {
-        ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
-      }
+        if ( subtokens->qty == 2 )
+        {
+            domain =  affinity_getDomain(subtokens->entry[0]);
 
-      numThreads = str2int(bdata(subtokens->entry[1]));
+            if (!domain)
+            {
+                ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
+            }
 
-      if (numThreads > domain->numberOfProcessors)
-      {
-        ERROR_PRINT(Invalid processor id requested. Avaialable 0-%d,domain->numberOfProcessors-1);
-      }
+            numThreads = str2int(bdata(subtokens->entry[1]));
 
-      for (j=0; j<numThreads; j++)
-      {
-          threads[globalNumThreads++] = domain->processorList[j];
-      }
-    }
-    else if ( subtokens->qty == 4 )
-    {
-      int counter;
-      int currentId = 0;
-      int startId = 0;
-      int chunksize =  str2int(bdata(subtokens->entry[2]));
-      int stride =  str2int(bdata(subtokens->entry[3]));
-      domain =  affinity_getDomain(subtokens->entry[0]);
+            if (numThreads > domain->numberOfProcessors)
+            {
+                ERROR_PRINT(Invalid processor id requested. Avaialable 0-%d,
+                            domain->numberOfProcessors-1);
+            }
 
-      if (!domain)
-      {
-        ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
-      }
+            for (j=0; j<numThreads; j++)
+            {
+                threads[globalNumThreads++] = domain->processorList[j];
+            }
+        }
+        else if ( subtokens->qty == 4 )
+        {
+            int counter;
+            int currentId = 0;
+            int startId = 0;
+            int chunksize =  str2int(bdata(subtokens->entry[2]));
+            int stride =  str2int(bdata(subtokens->entry[3]));
+            domain = affinity_getDomain(subtokens->entry[0]);
+
+            if (!domain)
+            {
+                ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
+            }
 
-      numThreads = str2int(bdata(subtokens->entry[1]));
+            numThreads = str2int(bdata(subtokens->entry[1]));
 
-      if (numThreads > domain->numberOfProcessors)
-      {
-        ERROR_PRINT(Invalid processor id requested. Avaialable 0-%d,domain->numberOfProcessors-1);
-      }
+            if (numThreads > domain->numberOfProcessors)
+            {
+                ERROR_PRINT(Invalid number of processors requested. Available 0-%d,
+                            domain->numberOfProcessors-1);
+            }
 
-      counter = chunksize;
 
-      for (j=0; j<numThreads; j++)
-      {
-        if (counter)
-        {
-          threads[globalNumThreads++] = domain->processorList[currentId++];
+            counter = 0;
+            for (j=0; j<numThreads; j+=chunksize)
+            {
+                for(i=0;i<chunksize && j+i<numThreads ;i++)
+                {
+                    threads[globalNumThreads++] = domain->processorList[counter+i];
+                }
+                counter += stride;
+                if (counter >= domain->numberOfProcessors)
+                {
+                    counter = 0;
+                }
+            }
         }
         else
         {
-          startId += stride;
-          if (startId >= numThreads) startId -= numThreads;
-          currentId = startId;
-          threads[globalNumThreads++] = domain->processorList[currentId++];
-          counter = chunksize;
+            PRINT_EXPR_ERR;
+            ERROR_PLAIN_PRINT(Parse Error);
         }
-        counter--;
-      }
+        bstrListDestroy(subtokens);
     }
-    else
-    {
-      PRINT_EXPR_ERR;
-      ERROR_PLAIN_PRINT(Parse Error);
-    }
-    bstrListDestroy(subtokens);
-  }
 
-  bstrListDestroy(tokens);
+    bstrListDestroy(tokens);
 
-  return globalNumThreads;
+    return globalNumThreads;
 }
 
 uint32_t
@@ -365,310 +362,319 @@ bstr_to_cpuset_scatter(uint32_t* threads,  const_bstring qi)
 
 
 #define CPUSET_ERROR  \
-  if (cpuid_isInCpuset()) {  \
-    ERROR_PLAIN_PRINT(You are running inside a cpuset. \
-            In cpusets only logical pinning inside set is allowed!);  \
-  }
+    if (cpuid_isInCpuset()) {  \
+        ERROR_PLAIN_PRINT(You are running inside a cpuset. In cpusets only logical pinning inside set is allowed!);  \
+    }
 
 
 
 int
 bstr_to_cpuset(int* threadsIN,  const_bstring q)
 {
-  uint32_t i;
-  int num=0;
-  int cpuMapping[cpuid_topology.numHWThreads];
-  cpu_set_t cpu_set;
-  uint32_t numThreads;
-  bstring domainStr = bformat("NSCM");
-  const_bstring  scatter = bformat("scatter");
-  struct bstrList* tokens;
-  CPU_ZERO(&cpu_set);
-  sched_getaffinity(0,sizeof(cpu_set_t), &cpu_set);
-  uint32_t* threads = (uint32_t*) threadsIN;
-
-  if (binchr (q, 0, domainStr) !=  BSTR_ERR)
-  {
-    CPUSET_ERROR;
-
-    if (binstr (q, 0 , scatter ) !=  BSTR_ERR)
-    {
-      numThreads =  bstr_to_cpuset_scatter(threads,q);
+    uint32_t i;
+    int num=0;
+    int cpuMapping[cpuid_topology.numHWThreads];
+    cpu_set_t cpu_set;
+    uint32_t numThreads;
+    bstring domainStr = bformat("NSCM");
+    const_bstring  scatter = bformat("scatter");
+    struct bstrList* tokens;
+    CPU_ZERO(&cpu_set);
+    sched_getaffinity(0,sizeof(cpu_set_t), &cpu_set);
+    uint32_t* threads = (uint32_t*) threadsIN;
+
+    if (binchr (q, 0, domainStr) !=  BSTR_ERR)
+    {
+        CPUSET_ERROR;
+
+        if (binstr (q, 0 , scatter ) !=  BSTR_ERR)
+        {
+          numThreads =  bstr_to_cpuset_scatter(threads,q);
+        }
+        else if (bstrchr (q, 'E') !=  BSTR_ERR)
+        {
+          numThreads =  bstr_to_cpuset_expression(threads,q);
+        }
+        else
+        {
+          numThreads =  bstr_to_cpuset_logical(threads,q);
+        }
     }
-    else if (bstrchr (q, 'E') !=  BSTR_ERR)
+    else if (bstrchr (q, 'L') !=  BSTR_ERR)
     {
-      numThreads =  bstr_to_cpuset_expression(threads,q);
+        uint32_t count = cpu_count(&cpu_set);
+
+        tokens = bsplit(q,':');
+        numThreads = bstr_to_cpuset_physical(threads,tokens->entry[1]);
+
+        for (i=0; i <  cpuid_topology.numHWThreads; i++)
+        {
+            if (CPU_ISSET(i,&cpu_set))
+            {
+                cpuMapping[num++]=i;
+            }
+        }
+
+        for (i=0; i < numThreads; i++)
+        {
+            if (!(threads[i] >= count))
+            {
+                threads[i] = cpuMapping[threads[i]];
+            }
+            else
+            {
+                fprintf(stderr, "Available CPUs: ");
+                for (int j=0; j< num-1;j++)
+                {
+                    fprintf(stderr, "%d,", cpuMapping[j]);
+                }
+                fprintf(stderr, "%d\n", cpuMapping[num-1]);
+                ERROR_PRINT(Index %d out of range.,threads[i]);
+            }
+        }
+        bstrListDestroy(tokens);
     }
     else
     {
-      numThreads =  bstr_to_cpuset_logical(threads,q);
+        CPUSET_ERROR;
+        numThreads = bstr_to_cpuset_physical(threads,q);
     }
-  }
-  else if (bstrchr (q, 'L') !=  BSTR_ERR)
-  {
-    uint32_t count = cpu_count(&cpu_set);
-    printf("Using logical numbering within cpuset %d\n",count);
-    tokens = bsplit(q,':');
-    numThreads = bstr_to_cpuset_physical(threads,tokens->entry[1]);
 
-    for (i=0; i <  cpuid_topology.numHWThreads; i++)
-    {
-      if (CPU_ISSET(i,&cpu_set))
-      {
-        cpuMapping[num++]=i;
-      }
-    }
-
-    for (i=0; i < numThreads; i++)
-    {
-      if (!(threads[i] > count))
-      {
-        threads[i] = cpuMapping[threads[i]];
-      }
-      else
-      {
-        ERROR_PRINT(Request cpu out of range of max %d,count);
-      }
-    }
-    bstrListDestroy(tokens);
-  }
-  else
-  {
-    CPUSET_ERROR;
-    numThreads = bstr_to_cpuset_physical(threads,q);
-  }
-
-  bdestroy(domainStr);
-  return (int) numThreads;
+    bdestroy(domainStr);
+    return (int) numThreads;
 }
 
 
 void
 bstr_to_eventset(StrUtilEventSet* set, const_bstring q)
 {
-  int i;
-  struct bstrList* tokens;
-  struct bstrList* subtokens;
+    int i;
+    struct bstrList* tokens;
+    struct bstrList* subtokens;
 
-  tokens = bsplit(q,',');
-  set->numberOfEvents = tokens->qty;
-  set->events = (StrUtilEvent*)
+    tokens = bsplit(q,',');
+    set->numberOfEvents = tokens->qty;
+    set->events = (StrUtilEvent*)
     malloc(set->numberOfEvents * sizeof(StrUtilEvent));
 
-  for (i=0;i<tokens->qty;i++)
-  {
-    subtokens = bsplit(tokens->entry[i],':');
-
-    if ( subtokens->qty != 2 )
-    {
-      
-      fprintf(stderr, "Cannot parse event string %s, probably missing counter name\n"
-      					,bdata(tokens->entry[i]));
-      fprintf(stderr, "Format: <eventName>:<counter>,...\n");
-      msr_finalize();
-      pci_finalize();
-      exit(EXIT_FAILURE);
-      //ERROR_PLAIN_PRINT(Error in parsing event string);
-    }
-    else
+    for (i=0;i<tokens->qty;i++)
     {
-      set->events[i].eventName = bstrcpy(subtokens->entry[0]);
-      set->events[i].counterName = bstrcpy(subtokens->entry[1]);
-    }
+        subtokens = bsplit(tokens->entry[i],':');
 
-    bstrListDestroy(subtokens);
-  }
+        if ( subtokens->qty != 2 )
+        {
+          
+            fprintf(stderr, "Cannot parse event string %s, probably missing counter name\n"
+                          ,bdata(tokens->entry[i]));
+            fprintf(stderr, "Format: <eventName>:<counter>,...\n");
+            msr_finalize();
+            pci_finalize();
+            exit(EXIT_FAILURE);
 
-  bstrListDestroy(tokens);
+        }
+        else
+        {
+            set->events[i].eventName = bstrcpy(subtokens->entry[0]);
+            set->events[i].counterName = bstrcpy(subtokens->entry[1]);
+        }
+
+        bstrListDestroy(subtokens);
+    }
+
+    bstrListDestroy(tokens);
 }
 
 FILE*
 bstr_to_outstream(const_bstring argString, bstring filter)
 {
-  int i;
-  char* cstr;
-  FILE* STREAM;
-  struct bstrList* tokens;
-  bstring base;
-  bstring suffix = bfromcstr(".");
-  bstring filename;
-
-  /* configure filter */
-  {
+    int i;
+    char* cstr;
+    FILE* STREAM;
+    struct bstrList* tokens;
+    bstring base;
+    bstring suffix = bfromcstr(".");
+    bstring filename;
+
+    /* configure filter */
     tokens = bsplit(argString,'.');
 
     if (tokens->qty < 2)
     {
-      fprintf(stderr, "Outputfile has no filetype suffix!\n");
-      fprintf(stderr, "Add suffix .txt for raw output or any supported filter suffix.\n");
-      exit(EXIT_FAILURE);
+        fprintf(stderr, "Outputfile has no filetype suffix!\n");
+        fprintf(stderr, "Add suffix .txt for raw output or any supported filter suffix.\n");
+        exit(EXIT_FAILURE);
     }
 
     base = bstrcpy(tokens->entry[0]);
 
     if (biseqcstr(tokens->entry[1],"txt"))
     {
-      bassigncstr(filter, "NO");
+        bassigncstr(filter, "NO");
     }
     else
     {
-      bassigncstr(filter, TOSTRING(LIKWIDFILTERPATH));
-      bconchar(filter,'/');
-      bconcat(filter,tokens->entry[1]);
+        bassigncstr(filter, TOSTRING(LIKWIDFILTERPATH));
+        bconchar(filter,'/');
+        bconcat(filter,tokens->entry[1]);
     }
 
     bconcat(suffix,tokens->entry[1]);
     bstrListDestroy(tokens);
-  }
-
-  tokens = bsplit(base,'_');
-
-  if (tokens->qty < 1)
-  {
-    ERROR_PLAIN_PRINT(Error in parsing file string);
-  }
-
-  filename = bstrcpy(tokens->entry[0]);
-
-  for (i=1; i<tokens->qty; i++)
-  {
-    if (biseqcstr(tokens->entry[i],"%j"))
-    {
-      cstr = getenv("PBS_JOBID");
-      if (cstr != NULL) 
-      {
-        bcatcstr(filename, "_");
-        bcatcstr(filename, cstr);
-      }
-    }
-    else if (biseqcstr(tokens->entry[i],"%r"))
-    {
-      cstr = getenv("PMI_RANK");
-      if (cstr == NULL) 
-      {
-        cstr = getenv("OMPI_COMM_WORLD_RANK");
-      }
-      if (cstr != NULL) 
-      {
-        bcatcstr(filename, "_");
-        bcatcstr(filename, cstr);
-      }
-    }
-    else if (biseqcstr(tokens->entry[i],"%h"))
-    {
-      cstr = (char*) malloc(HOST_NAME_MAX * sizeof(char));
-      gethostname(cstr,HOST_NAME_MAX);
-      bcatcstr(filename, "_");
-      bcatcstr(filename, cstr);
-      free(cstr);
-    }
-    else if (biseqcstr(tokens->entry[i],"%p"))
-    {
-      bstring pid = bformat("_%d",getpid());
-      bconcat(filename, pid);
-      bdestroy(pid);
-    }
-    else 
-    {
-      ERROR_PLAIN_PRINT(Unsupported placeholder in filename!);
-    }
-  }
-
-  if (biseqcstr(filter,"NO"))
-  {
-    bconcat(filename, suffix);
-  }
-  else
-  {
-    bcatcstr(filter, " ");
-    bcatcstr(filename, ".tmp");
-    bconcat(filter, filename);
-  }
-
-  bstrListDestroy(tokens);
-  STREAM = fopen(bdata(filename),"w");
-  bdestroy(filename);
-  bdestroy(suffix);
-  bdestroy(base);
-
-  return STREAM;
+
+    tokens = bsplit(base,'_');
+
+    if (tokens->qty < 1)
+    {
+        ERROR_PLAIN_PRINT(Error in parsing file string);
+    }
+
+    filename = bstrcpy(tokens->entry[0]);
+
+    for (i=1; i<tokens->qty; i++)
+    {
+        if (biseqcstr(tokens->entry[i],"%j"))
+        {
+            cstr = getenv("PBS_JOBID");
+            if (cstr != NULL) 
+            {
+                bcatcstr(filename, "_");
+                bcatcstr(filename, cstr);
+            }
+        }
+        else if (biseqcstr(tokens->entry[i],"%r"))
+        {
+            cstr = getenv("PMI_RANK");
+            if (cstr == NULL) 
+            {
+                cstr = getenv("OMPI_COMM_WORLD_RANK");
+            }
+            if (cstr != NULL) 
+            {
+                bcatcstr(filename, "_");
+                bcatcstr(filename, cstr);
+            }
+        }
+        else if (biseqcstr(tokens->entry[i],"%h"))
+        {
+            cstr = (char*) malloc(HOST_NAME_MAX * sizeof(char));
+            gethostname(cstr,HOST_NAME_MAX);
+            bcatcstr(filename, "_");
+            bcatcstr(filename, cstr);
+            free(cstr);
+        }
+        else if (biseqcstr(tokens->entry[i],"%p"))
+        {
+            bstring pid = bformat("_%d",getpid());
+            bconcat(filename, pid);
+            bdestroy(pid);
+        }
+        else 
+        {
+            ERROR_PLAIN_PRINT(Unsupported placeholder in filename!);
+        }
+    }
+
+    if (biseqcstr(filter,"NO"))
+    {
+        bconcat(filename, suffix);
+    }
+    else
+    {
+        bcatcstr(filter, " ");
+        bcatcstr(filename, ".tmp");
+        bconcat(filter, filename);
+    }
+
+    bstrListDestroy(tokens);
+    STREAM = fopen(bdata(filename),"w");
+    bdestroy(filename);
+    bdestroy(suffix);
+    bdestroy(base);
+
+    return STREAM;
 }
 
 
 uint64_t
 bstr_to_doubleSize(const_bstring str, DataType type)
 {
-  bstring unit = bmidstr(str, blength(str)-2, 2);
-  bstring sizeStr = bmidstr(str, 0, blength(str)-2);
-  uint64_t sizeU = str2int(bdata(sizeStr));
-  uint64_t junk = 0;
-  uint64_t bytesize = 0;
-
-  switch (type)
-  {
-    case SINGLE:
-      bytesize = sizeof(float);
-      break;
-
-    case DOUBLE:
-      bytesize = sizeof(double);
-      break;
-  }
-
-  if (biseqcstr(unit, "kB")) {
-    junk = (sizeU *1024)/bytesize;
-  } else if (biseqcstr(unit, "MB")) {
-    junk = (sizeU *1024*1024)/bytesize;
-  } else if (biseqcstr(unit, "GB")) {
-    junk = (sizeU *1024*1024*1024)/bytesize;
-  }
-
-  return junk;
+    bstring unit = bmidstr(str, blength(str)-2, 2);
+    bstring sizeStr = bmidstr(str, 0, blength(str)-2);
+    uint64_t sizeU = str2int(bdata(sizeStr));
+    uint64_t junk = 0;
+    uint64_t bytesize = 0;
+
+    switch (type)
+    {
+        case SINGLE:
+        case SINGLE_RAND:
+            bytesize = sizeof(float);
+            break;
+
+        case DOUBLE:
+        case DOUBLE_RAND:
+            bytesize = sizeof(double);
+            break;
+    }
+
+    if (biseqcstr(unit, "kB")) {
+        junk = (sizeU *1024)/bytesize;
+    } else if (biseqcstr(unit, "MB")) {
+        junk = (sizeU *1024*1024)/bytesize;
+    } else if (biseqcstr(unit, "GB")) {
+        junk = (sizeU *1024*1024*1024)/bytesize;
+    }
+
+    return junk;
 }
 
 void
 bstr_to_interval(const_bstring str, struct timespec* interval)
 {
-  int size;
-  int pos;
-  bstring ms = bformat("ms");
-
-  if ((pos = bstrrchr (str, 's')) != BSTR_ERR)
-  {
-    if (pos != (blength(str)-1))
-    {
-      ERROR_PLAIN_PRINT(Parsing of daemon interval failed);
-    }
-
-    /* unit is ms */
-    if (binstrr (str, blength(str), ms) != BSTR_ERR)
-    {
-      bstring sizeStr = bmidstr(str, 0, blength(str)-2);
-      size = str2int(bdata(sizeStr));
-      if (size >= 1000)
-      {
-        interval->tv_sec = size/1000;
-        interval->tv_nsec = (size%1000) * 1.E06;
-      }
-      else
-      {
-        interval->tv_sec = 0L;
-        interval->tv_nsec = size * 1.E06;
-      }
-    }
-    /* unit is s */
-    else 
-    {
-      bstring sizeStr = bmidstr(str, 0, blength(str)-1);
-      size = str2int(bdata(sizeStr));
-      interval->tv_sec = size;
-      interval->tv_nsec = 0L;
-    }
-  }
-  else
-  {
-    ERROR_PLAIN_PRINT(Parsing of daemon interval failed);
-  }
+    int size;
+    int pos;
+    bstring ms = bformat("ms");
+
+    if ((pos = bstrrchr (str, 's')) != BSTR_ERR)
+    {
+        if (pos != (blength(str)-1))
+        {
+            fprintf(stderr, "You need to specify a time unit s or ms like 200ms\n");
+            msr_finalize();
+            exit(EXIT_FAILURE);
+        }
+
+        /* unit is ms */
+        if (binstrr (str, blength(str), ms) != BSTR_ERR)
+        {
+            bstring sizeStr = bmidstr(str, 0, blength(str)-2);
+            size = str2int(bdata(sizeStr));
+            if (size >= 1000)
+            {
+                interval->tv_sec = size/1000;
+                interval->tv_nsec = (size%1000) * 1.E06;
+            }
+            else
+            {
+                interval->tv_sec = 0L;
+                interval->tv_nsec = size * 1.E06;
+            }
+        }
+        /* unit is s */
+        else 
+        {
+            bstring sizeStr = bmidstr(str, 0, blength(str)-1);
+            size = str2int(bdata(sizeStr));
+            interval->tv_sec = size;
+            interval->tv_nsec = 0L;
+        }
+    }
+    else
+    {
+        fprintf(stderr, "You need to specify a time unit s or ms like 200ms\n");
+        msr_finalize();
+        exit(EXIT_FAILURE);
+    }
 }
 
 
@@ -678,199 +684,200 @@ bstr_to_workgroup(Workgroup* group,
     DataType type,
     int numberOfStreams)
 {
-  uint32_t i;
-  int parseStreams = 0;
-  bstring threadInfo;
-  bstring streams= bformat("0");
-  struct bstrList* tokens;
-  struct bstrList* subtokens;
-  const AffinityDomain* domain;
-
-  /* split the workgroup into the thread and the streams part */
-  tokens = bsplit(str,'-');
-
-  if (tokens->qty == 2)
-  {
-    threadInfo = bstrcpy(tokens->entry[0]);
-    streams = bstrcpy(tokens->entry[1]);
-    parseStreams = 1;
-  }
-  else if (tokens->qty == 1)
-  {
-    threadInfo = bstrcpy(tokens->entry[0]);
-  }
-  else
-  {
-    ERROR_PLAIN_PRINT(Error in parsing workgroup string);
-  }
-
-  bstrListDestroy (tokens);
-  tokens = bsplit(threadInfo,':');
-
-  if (tokens->qty == 5)
-  {
-    uint32_t maxNumThreads;
-    int chunksize;
-    int stride;
-    int counter;
-    int currentId = 0;
-    int startId = 0;
+    uint32_t i;
+    int parseStreams = 0;
+    bstring threadInfo;
+    bstring streams= bformat("0");
+    struct bstrList* tokens;
+    struct bstrList* subtokens;
+    const AffinityDomain* domain;
 
-    domain = affinity_getDomain(tokens->entry[0]);
+    /* split the workgroup into the thread and the streams part */
+    tokens = bsplit(str,'-');
 
-    if (domain == NULL)
+    if (tokens->qty == 2)
     {
-      fprintf(stderr, "Error: Domain %s not available on current machine.\nTry likwid-bench -p for supported domains.",
-          bdata(tokens->entry[0]));
-      exit(EXIT_FAILURE);
+        threadInfo = bstrcpy(tokens->entry[0]);
+        streams = bstrcpy(tokens->entry[1]);
+        parseStreams = 1;
+    }
+    else if (tokens->qty == 1)
+    {
+        threadInfo = bstrcpy(tokens->entry[0]);
+    }
+    else
+    {
+        ERROR_PLAIN_PRINT(Error in parsing workgroup string);
     }
 
-    group->size = bstr_to_doubleSize(tokens->entry[1], type);
-    group->numberOfThreads = str2int(bdata(tokens->entry[2]));
-    chunksize = str2int(bdata(tokens->entry[3]));
-    stride = str2int(bdata(tokens->entry[4]));
-    maxNumThreads = (domain->numberOfProcessors / stride) * chunksize;
+    bstrListDestroy (tokens);
+    tokens = bsplit(threadInfo,':');
 
-    if (group->numberOfThreads > maxNumThreads)
+    if (tokens->qty == 5)
     {
-      fprintf(stderr, "Error: Domain %s supports only up to %d threads with used expression.\n",
-          bdata(tokens->entry[0]), maxNumThreads);
-      exit(EXIT_FAILURE);
-    }
+        uint32_t maxNumThreads;
+        int chunksize;
+        int stride;
+        int counter;
+        int currentId = 0;
+        int startId = 0;
+
+        domain = affinity_getDomain(tokens->entry[0]);
 
-    group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
+        if (domain == NULL)
+        {
+          fprintf(stderr, "Error: Domain %s not available on current machine.\nTry likwid-bench -p for supported domains.",
+              bdata(tokens->entry[0]));
+          exit(EXIT_FAILURE);
+        }
 
-    counter = chunksize;
+        group->size = bstr_to_doubleSize(tokens->entry[1], type);
+        group->numberOfThreads = str2int(bdata(tokens->entry[2]));
+        chunksize = str2int(bdata(tokens->entry[3]));
+        stride = str2int(bdata(tokens->entry[4]));
+        maxNumThreads = (domain->numberOfProcessors / stride) * chunksize;
 
-    for (i=0; i<group->numberOfThreads; i++)
-    {
-      if (counter)
-      {
-        group->processorIds[i] = domain->processorList[currentId++];
-      }
-      else
-      {
-        startId += stride;
-        currentId = startId;
-        group->processorIds[i] = domain->processorList[currentId++];
-        counter = chunksize;
-      }
-      counter--;
-    }
-  }
-  else if (tokens->qty == 3)
-  {
-    domain = affinity_getDomain(tokens->entry[0]);
+        if (group->numberOfThreads > maxNumThreads)
+        {
+          fprintf(stderr, "Error: Domain %s supports only up to %d threads with used expression.\n",
+                        bdata(tokens->entry[0]), maxNumThreads);
+          exit(EXIT_FAILURE);
+        }
 
-    if (domain == NULL)
-    {
-      fprintf(stderr, "Error: Domain %s not available on current machine.\nTry likwid-bench -p for supported domains.",
-          bdata(tokens->entry[0]));
-      exit(EXIT_FAILURE);
-    }
+        group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
 
-    group->size = bstr_to_doubleSize(tokens->entry[1], type);
-    group->numberOfThreads = str2int(bdata(tokens->entry[2]));
+        counter = chunksize;
 
-    if (group->numberOfThreads > domain->numberOfProcessors)
-    {
-      fprintf(stderr, "Error: Domain %s supports only up to %d threads.\n",
-          bdata(tokens->entry[0]),domain->numberOfProcessors);
-      exit(EXIT_FAILURE);
+        for (i=0; i<group->numberOfThreads; i++)
+        {
+            if (counter)
+            {
+                group->processorIds[i] = domain->processorList[currentId++];
+            }
+            else
+            {
+                startId += stride;
+                currentId = startId;
+                group->processorIds[i] = domain->processorList[currentId++];
+                counter = chunksize;
+            }
+            counter--;
+        }
     }
+    else if (tokens->qty == 3)
+    {
+        domain = affinity_getDomain(tokens->entry[0]);
 
-    group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
+        if (domain == NULL)
+        {
+            fprintf(stderr, "Error: Domain %s not available on current machine.\n", bdata(tokens->entry[0]));
+            fprintf(stderr, "Try likwid-bench -p for supported domains.\n");
+            exit(EXIT_FAILURE);
+        }
 
-    for (i=0; i<group->numberOfThreads; i++)
-    {
-      group->processorIds[i] = domain->processorList[i];
-    }
-  }
-  else if (tokens->qty == 2)
-  {
-    domain = affinity_getDomain(tokens->entry[0]);
+        group->size = bstr_to_doubleSize(tokens->entry[1], type);
+        group->numberOfThreads = str2int(bdata(tokens->entry[2]));
 
-    if (domain == NULL)
-    {
-      fprintf(stderr, "Error: Domain %s not available on current machine.\nTry likwid-bench -p for supported domains.",
-          bdata(tokens->entry[0]));
-      exit(EXIT_FAILURE);
-    }
+        if (group->numberOfThreads > domain->numberOfProcessors)
+        {
+            fprintf(stderr, "Error: Domain %s supports only up to %d threads.\n",
+                            bdata(tokens->entry[0]),domain->numberOfProcessors);
+            exit(EXIT_FAILURE);
+        }
 
-    group->size = bstr_to_doubleSize(tokens->entry[1], type);
-    group->numberOfThreads = domain->numberOfProcessors;
-    group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
+        group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
 
-    for (i=0; i<group->numberOfThreads; i++)
-    {
-      group->processorIds[i] = domain->processorList[i];
+        for (i=0; i<group->numberOfThreads; i++)
+        {
+            group->processorIds[i] = domain->processorList[i];
+        }
     }
-  }
-  else
-  {
-    ERROR_PLAIN_PRINT(Error in parsing workgroup string);
-  }
+    else if (tokens->qty == 2)
+    {
+        domain = affinity_getDomain(tokens->entry[0]);
 
-  bstrListDestroy(tokens);
+        if (domain == NULL)
+        {
+            fprintf(stderr, "Error: Domain %s not available on current machine.\n",
+                            bdata(tokens->entry[0]));
+            fprintf(stderr, "Try likwid-bench -p for supported domains.\n");
+            exit(EXIT_FAILURE);
+        }
 
-  /* parse stream list */
-  if (parseStreams)
-  {
-    tokens = bsplit(streams,',');
+        group->size = bstr_to_doubleSize(tokens->entry[1], type);
+        group->numberOfThreads = domain->numberOfProcessors;
+        group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
 
-    if (tokens->qty < numberOfStreams)
+        for (i=0; i<group->numberOfThreads; i++)
+        {
+            group->processorIds[i] = domain->processorList[i];
+        }
+    }
+    else
     {
-      ERROR_PRINT(Testcase requires at least %d streams, numberOfStreams);
+    ERROR_PLAIN_PRINT(Error in parsing workgroup string);
     }
 
-    group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
+    bstrListDestroy(tokens);
 
-    for (i=0;i<(uint32_t) tokens->qty;i++)
+    /* parse stream list */
+    if (parseStreams)
     {
-      subtokens = bsplit(tokens->entry[i],':');
+        tokens = bsplit(streams,',');
 
-      if ( subtokens->qty == 3 )
-      {
-        int index = str2int(bdata(subtokens->entry[0]));
-        if (index >= numberOfStreams)
+        if (tokens->qty < numberOfStreams)
         {
-          ERROR_PRINT(Stream Index %d out of range,index);
+            ERROR_PRINT(Testcase requires at least %d streams, numberOfStreams);
         }
-        group->streams[index].domain = bstrcpy(subtokens->entry[1]);
-        group->streams[index].offset = str2int(bdata(subtokens->entry[2]));
-      }
-      else if ( subtokens->qty == 2 )
-      {
-        int index = str2int(bdata(subtokens->entry[0]));
-        if (index >= numberOfStreams)
+
+        group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
+
+        for (i=0;i<(uint32_t) tokens->qty;i++)
         {
-          ERROR_PRINT(Stream Index %d out of range,index);
-        }
-        group->streams[index].domain = bstrcpy(subtokens->entry[1]);
-        group->streams[index].offset = 0;
-      }
-      else
-      {
-        ERROR_PLAIN_PRINT(Error in parsing event string);
-      }
+            subtokens = bsplit(tokens->entry[i],':');
 
-      bstrListDestroy(subtokens);
-    }
+            if ( subtokens->qty == 3 )
+            {
+                int index = str2int(bdata(subtokens->entry[0]));
+                if (index >= numberOfStreams)
+                {
+                    ERROR_PRINT(Stream Index %d out of range,index);
+                }
+                group->streams[index].domain = bstrcpy(subtokens->entry[1]);
+                group->streams[index].offset = str2int(bdata(subtokens->entry[2]));
+            }
+            else if ( subtokens->qty == 2 )
+            {
+                int index = str2int(bdata(subtokens->entry[0]));
+                if (index >= numberOfStreams)
+                {
+                    ERROR_PRINT(Stream Index %d out of range,index);
+                }
+                group->streams[index].domain = bstrcpy(subtokens->entry[1]);
+                group->streams[index].offset = 0;
+            }
+            else
+            {
+                ERROR_PLAIN_PRINT(Error in parsing event string);
+            }
 
-    bstrListDestroy(tokens);
-  }
-  else
-  {
-    group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
+            bstrListDestroy(subtokens);
+        }
 
-    for (i=0; i< (uint32_t)numberOfStreams; i++)
+        bstrListDestroy(tokens);
+    }
+    else
     {
-      group->streams[i].domain = domain->tag;
-      group->streams[i].offset = 0;
+        group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
+
+        for (i=0; i< (uint32_t)numberOfStreams; i++)
+        {
+            group->streams[i].domain = domain->tag;
+            group->streams[i].offset = 0;
+        }
     }
-  }
 
-  group->size /= numberOfStreams;
+    group->size /= numberOfStreams;
 }
 
 
@@ -878,91 +885,91 @@ bstr_to_workgroup(Workgroup* group,
 
 bstring
 bSecureInput (int maxlen, char* vgcCtx) {
-  int i, m, c = 1;
-  bstring b, t;
-  int termchar = 0;
+    int i, m, c = 1;
+    bstring b, t;
+    int termchar = 0;
 
-  if (!vgcCtx) return NULL;
+    if (!vgcCtx) return NULL;
 
-  b = bfromcstralloc (INIT_SECURE_INPUT_LENGTH, "");
+    b = bfromcstralloc (INIT_SECURE_INPUT_LENGTH, "");
 
-  for (i=0; ; i++)
-  {
-    if (termchar == c)
-    {
-      break;
-    }
-    else if ((maxlen > 0) && (i >= maxlen))
-    {
-      b = NULL;
-      return b;
-    }
-    else
+    for (i=0; ; i++)
     {
-      c = *(vgcCtx++);
-    }
+        if (termchar == c)
+        {
+            break;
+        }
+        else if ((maxlen > 0) && (i >= maxlen))
+        {
+            b = NULL;
+            return b;
+        }
+        else
+        {
+            c = *(vgcCtx++);
+        }
 
-    if (EOF == c)
-    {
-      break;
-    }
+        if (EOF == c)
+        {
+            break;
+        }
 
-    if (i+1 >= b->mlen) {
+        if (i+1 >= b->mlen) {
 
-      /* Double size, but deal with unusual case of numeric
-         overflows */
+            /* Double size, but deal with unusual case of numeric
+             overflows */
 
-      if ((m = b->mlen << 1)   <= b->mlen     &&
-          (m = b->mlen + 1024) <= b->mlen &&
-          (m = b->mlen + 16)   <= b->mlen &&
-          (m = b->mlen + 1)    <= b->mlen)
-      {
-        t = NULL;
-      }
-      else
-      {
-        t = bfromcstralloc (m, "");
-      }
+            if ((m = b->mlen << 1)   <= b->mlen &&
+                (m = b->mlen + 1024) <= b->mlen &&
+                (m = b->mlen + 16)   <= b->mlen &&
+                (m = b->mlen + 1)    <= b->mlen)
+            {
+                t = NULL;
+            }
+            else
+            {
+                t = bfromcstralloc (m, "");
+            }
 
-      if (t)
-      {
-        memcpy (t->data, b->data, i);
-      }
+            if (t)
+            {
+                memcpy (t->data, b->data, i);
+            }
 
-      bdestroy (b); /* Clean previous buffer */
-      b = t;
-      if (!b)
-      {
-        return b;
-      }
-    }
+            bdestroy (b); /* Clean previous buffer */
+            b = t;
+            if (!b)
+            {
+                return b;
+            }
+        }
 
-    b->data[i] = (unsigned char) c;
-  }
+        b->data[i] = (unsigned char) c;
+    }
 
-  i--;
-  b->slen = i;
-  b->data[i] = (unsigned char) '\0';
-  return b;
+    i--;
+    b->slen = i;
+    b->data[i] = (unsigned char) '\0';
+    return b;
 }
 
 
 int
 bJustifyCenter (bstring b, int width) 
 {
-  unsigned char space  = ' ';
-  int alignSpace = (width - b->slen) / 2;
-  int restSpace = (width - b->slen) % 2;
-  if (width <= 0) return -__LINE__;
+    unsigned char space  = ' ';
+    int alignSpace = (width - b->slen) / 2;
+    int restSpace = (width - b->slen) % 2;
+    if (width <= 0) return -__LINE__;
 
-  if (b->slen <= width)
-  {
-    binsertch (b, 0, alignSpace, space);
-  }
+    if (b->slen <= width)
+    {
+        binsertch (b, 0, alignSpace, space);
+    }
 
-  binsertch (b, b->slen , alignSpace+restSpace, space);
+    binsertch (b, b->slen , alignSpace+restSpace, space);
 
-  return BSTR_OK;
+    return BSTR_OK;
 }
 
 
diff --git a/src/thermal.c b/src/thermal.c
index 45e7d27..0812086 100644
--- a/src/thermal.c
+++ b/src/thermal.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Module implementing Intel TM/TM2 interface
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -68,6 +68,7 @@ void thermal_init(int cpuId)
         flags = 0ULL;
         flags = msr_read(cpuId, MSR_TEMPERATURE_TARGET);
         thermal_info.activationT =  extractBitField(flags,8,16);
+        thermal_info.offset = extractBitField(flags,6,24);
     }
 }
 
diff --git a/src/threads.c b/src/threads.c
index f96f550..87fa2b2 100644
--- a/src/threads.c
+++ b/src/threads.c
@@ -5,8 +5,8 @@
  *
  *      Description:  High level interface to pthreads
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -54,7 +54,7 @@ static int numThreads = 0;
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
 void
-threads_init(int numberOfThreads)
+threads_init(FILE* OUTSTREAM, int numberOfThreads)
 {
     int i;
     numThreads = numberOfThreads;
@@ -68,6 +68,7 @@ threads_init(int numberOfThreads)
         threads_data[i].globalNumberOfThreads = numThreads;
         threads_data[i].globalThreadId = i;
         threads_data[i].threadId = i;
+        threads_data[i].output = OUTSTREAM;
     }
 
     pthread_barrier_init(&threads_barrier, NULL, numThreads);
@@ -102,7 +103,7 @@ threads_createGroups(int numberOfGroups)
     {
         ERROR_PRINT(Not enough threads %d to create %d groups,numThreads,numberOfGroups);
     }
-    else 
+    else
     {
         numThreadsPerGroup = numThreads / numberOfGroups;
     }
@@ -205,11 +206,11 @@ threads_join(void)
 void
 threads_destroy(int numberOfGroups)
 {
-	int i;
+    int i;
     free(threads_data);
     for(i=0;i<numberOfGroups;i++)
     {
-    	free(threads_groups[i].threadIds);
+        free(threads_groups[i].threadIds);
     }
     free(threads_groups);
     free(threads);
diff --git a/src/timer.c b/src/timer.c
index 32a97d4..337c13d 100644
--- a/src/timer.c
+++ b/src/timer.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Implementation of timer module
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -79,19 +79,19 @@ getCpuSpeed(void)
          ((uint64_t)tv1.tv_sec * 1000000 + tv1.tv_usec));
 #endif
 #ifdef _ARCH_PPC
-	FILE *fpipe;
-	char *command="grep timebase /proc/cpuinfo | awk '{ print $3 }'";
-	char buff[256];
-
-	if ( !(fpipe = (FILE*)popen(command,"r")) )
-	{  // If fpipe is NULL
-		perror("Problems with pipe");
-		exit(1);
-	}
+    FILE *fpipe;
+    char *command="grep timebase /proc/cpuinfo | awk '{ print $3 }'";
+    char buff[256];
+
+    if ( !(fpipe = (FILE*)popen(command,"r")) )
+    {  // If fpipe is NULL
+        perror("Problems with pipe");
+        exit(1);
+    }
 
-	fgets(buff, 256, fpipe);
+    fgets(buff, 256, fpipe);
 
-	return (uint64_t)   atoi(buff);
+    return (uint64_t)   atoi(buff);
 #endif
 }
 
@@ -120,7 +120,7 @@ double timer_print( TimerData* time )
     uint64_t cycles;
 
     /* clamp to zero if something goes wrong */
-   if ((time->stop.int64-baseline) < time->start.int64)
+    if ((time->stop.int64-baseline) < time->start.int64)
     {
         cycles = 0ULL;
     }
diff --git a/src/tree.c b/src/tree.c
index e93ecc4..795dd17 100644
--- a/src/tree.c
+++ b/src/tree.c
@@ -5,8 +5,8 @@
  *
  *      Description:  Module implementing a tree data structure
  *
- *      Version:   3.1.2
- *      Released:  2.6.2014
+ *      Version:   3.1.3
+ *      Released:  4.11.2014
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -48,33 +48,33 @@ tree_init(TreeNode** root, int id)
 void
 tree_print(TreeNode* nodePtr)
 {
-  int level = 0;
+    int level = 0;
 
-  if (nodePtr != NULL)
-  {
+    if (nodePtr != NULL)
+    {
 
-    TreeNode* digger;
-    TreeNode* walker;
+        TreeNode* digger;
+        TreeNode* walker;
 
-    digger = nodePtr->llink;
+        digger = nodePtr->llink;
 
-    while (digger != NULL)
-    {
-      printf("\n Level %d:\n", level++);
-      printf("%d ", digger->id);
-      walker = digger->rlink;
+        while (digger != NULL)
+        {
+            printf("\n Level %d:\n", level++);
+            printf("%d ", digger->id);
+            walker = digger->rlink;
 
-      while (walker != NULL)
-      {
-        printf("%d ", walker->id);
-        walker = walker->rlink;
-      }
+            while (walker != NULL)
+            {
+            printf("%d ", walker->id);
+            walker = walker->rlink;
+            }
 
-      digger = digger->llink;
-    }
+            digger = digger->llink;
+        }
 
-    printf("\n ");
-  }
+        printf("\n ");
+    }
 }
 
 void
diff --git a/test/accuracy/Makefile b/test/accuracy/Makefile
new file mode 100644
index 0000000..f84b1cd
--- /dev/null
+++ b/test/accuracy/Makefile
@@ -0,0 +1,25 @@
+LIKWID_PATH=../..
+LIKWID_APP=likwid-bench
+HOST=$(shell hostname -s)
+
+
+all: plain marker
+
+plain:
+	sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
+	sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
+	cd $(LIKWID_PATH) && make distclean && make
+	cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-plain
+
+marker:
+	sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = true#'/g $(LIKWID_PATH)/config.mk
+	sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
+	cd $(LIKWID_PATH) && make distclean && make
+	cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-marker
+papi:
+	sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
+	cp $(LIKWID_PATH)/Makefile $(LIKWID_PATH)/Makefile.orig
+	sed -i -e s/'CPPFLAGS := '/'CPPFLAGS := -DPAPI '/g $(LIKWID_PATH)/Makefile
+	cd $(LIKWID_PATH) && make distclean && make
+	cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-papi
+	mv $(LIKWID_PATH)/Makefile.orig $(LIKWID_PATH)/Makefile
diff --git a/test/accuracy/README b/test/accuracy/README
new file mode 100644
index 0000000..9dd8a78
--- /dev/null
+++ b/test/accuracy/README
@@ -0,0 +1,18 @@
+LIKWID accuracy tester
+
+likwid-tester and likwid-tester-plot are test applications written in Perl. The likwid-accuracy.py application does the same but is written in Python.
+
+Usage:
+make #build non-instrumented and LIKWID-instrumented versions of
+likwid-bench.
+Adjust test files in TESTS.
+Adjust the test set file SET.txt or use the -s/--sets switch on the command line.
+likwid-accuracy.py #Runs the tests of all sets and saves results in folder RESULTS/<hostname>
+
+Options for likwid-accuracy.py:
+--pgf: Create a TeX file containing the definition of a PGF plot with suffix .tex -> .pdf
+--grace: Create grace batch file for further manipulation with XMgrace or create plot with gracebat .agr/.bat -> .png
+--gnuplot: Create GNUplot script .plot -> .jpg
+--script: Create a Bash script containing all commands to create all plots using pdflatex, gracebat and gnuplot.
+--scriptname: Set name for Bash script, default is $CWD/create_plots.sh
+--wiki/--only_wiki: Create a Wiki page for the Google Code Wiki including the .png pics found in Google Code Wiki picture path (http://<project>.googlecode.com/svn/wiki/images). 
diff --git a/test/accuracy/likwid-accuracy.py b/test/accuracy/likwid-accuracy.py
new file mode 100755
index 0000000..3d2d63c
--- /dev/null
+++ b/test/accuracy/likwid-accuracy.py
@@ -0,0 +1,533 @@
+#!/usr/bin/env python
+
+import os, sys, os.path
+import re
+import subprocess
+import socket
+import stat
+import getopt
+
+# Needed for Wiki page output
+import glob
+import statistics
+
+bench_plain = "./likwid-bench-plain"
+bench_marker = "./likwid-bench-marker"
+bench_papi = "./likwid-bench-papi"
+perfctr = "../../likwid-perfctr"
+topology = "../../likwid-topology"
+topology_type = re.compile("^CPU type:\s+(.*)")
+topology_sockets = re.compile("^Sockets:\s+(\d+)")
+topology_corespersocket = re.compile("^Cores per socket:\s+(\d+)")
+topology_threadspercore = re.compile("^Threads per core:\s+(\d+)")
+testlist = "SET.txt"
+testfolder = "TESTS"
+resultfolder = "RESULTS"
+hostname = socket.gethostname()
+picture_base = "http://likwid.googlecode.com/svn/wiki/images"
+
+gnu_colors = ["red","blue","green"]#,"black","brown", "gray","violet", "cyan", "magenta","orange","#4B0082","#800000","turquoise","#006400","yellow"]
+gnu_marks = [5,13,9]#,2,3,4,6,7,8,9,10,11,12,14,15]
+
+wiki = False
+papi = False
+only_wiki = False
+sets = []
+out_pgf = False
+out_gnuplot = False
+out_grace = False
+scriptfilename = "create_plots.sh"
+out_script = False
+
+def usage():
+    print "Execute and evaluate accuracy tests for LIKWID with likwid-bench and likwid-perfctr"
+    print
+    print "-h/--help:\tPrint this help text"
+    print "-s/--sets:\tSpecifiy testgroups (comma separated). Can also be set in SET.txt"
+    print "--wiki:\t\tBesides testing write out results in Google code wiki syntax"
+    print "--only_wiki:\tDo not run benchmarks, read results from file and write out results in Google code wiki syntax"
+    print "Picture options:"
+    print "--pgf:\t\tCreate TeX document for each test with PGFPlot"
+    print "--gnuplot:\tCreate GNUPlot script for each test"
+    print "--grace:\tCreate Grace script that can be evaluated with gracebat"
+    print "--script:\tActivate recording of commands in a bash script"
+    print "--scriptname:\tRecord commands to create pictures in file (default: %s)" % (os.path.join(os.path.join(resultfolder,hostname),scriptfilename))
+
+def get_system_info():
+    """Return (CPU name, sockets, cores per socket, threads per core) parsed from the likwid-topology output."""
+    name = None
+    sockets = 0
+    corespersocket = 0
+    threadspercore = 0
+    p = subprocess.Popen(topology, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    output = p.communicate()[0]  # drains the pipe while waiting; wait() alone can block once the pipe is full
+    if p.returncode != 0:
+        # Keep the 4-tuple shape so the caller's tuple unpacking also works when the tool fails.
+        return "Unknown system", sockets, corespersocket, threadspercore
+    for line in output.split("\n"):
+        if not line.strip() or line.startswith("*") or line.startswith("-"): continue
+        if line.startswith("CPU type"):
+            name = topology_type.match(line).group(1).strip()
+        if line.startswith("Sockets"):
+            sockets = int(topology_sockets.match(line).group(1))
+        if line.startswith("Cores per socket"):
+            corespersocket = int(topology_corespersocket.match(line).group(1))
+        if line.startswith("Threads per core"):
+            threadspercore = int(topology_threadspercore.match(line).group(1))
+        if name and sockets > 0 and corespersocket > 0 and threadspercore > 0:
+            break  # every field found, no need to scan the rest of the output
+    return name, sockets, corespersocket, threadspercore
+
+def get_groups():  # returns {group name: description} parsed from the output of "likwid-perfctr -a"
+    groups = {}
+    p = subprocess.Popen(perfctr+" -a", shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    output = p.communicate()[0]  # drains the pipe while waiting; wait() alone can block once the pipe is full
+    if p.returncode != 0:
+        return groups
+    for line in output.split("\n"):
+        if line.startswith("-") or not line.strip(): continue
+        if line.startswith("Available") or ":" not in line: continue  # header or a line that is not "name: description"
+        name, description = line.split(":", 1)  # split once only: the description may itself contain ':'
+        groups[name.strip()] = description.strip()
+    return groups
+
+def get_test_groups(groupdict):
+    groups = {}
+    if len(sets) > 0:
+        setlist = sets
+    else:
+        setfp = open("SET.txt",'r')
+        setlist = setfp.read().strip().split("\n")
+        setfp.close()
+    
+    filelist = glob.glob(testfolder+"/*.txt")
+    for name in setlist:
+        tests = []
+        file = os.path.join(testfolder, name) + ".txt"
+        if not os.path.exists(file): continue
+        fp = open(file,'r')
+        finput = fp.read().strip().split("\n")
+        fp.close()    
+        for line in finput:
+            if line.startswith("TEST"):
+                tests.append(line.split(" ")[1])
+        groups[name] = tests
+                
+            
+    return groups
+    
+def get_values_from_file(file, lineoffset, linecount):
+    results = []
+    fp = open(file,'r')
+    finput = fp.read().strip().split("\n")
+    fp.close()
+    try:
+        for line in finput[lineoffset:lineoffset+linecount]:
+            results.append(float(line.split(" ")[1]))
+    except:
+        print "Cannot read file %s from %d to %d" % (file, lineoffset,lineoffset+linecount, )
+        for line in finput[lineoffset:lineoffset+linecount]:
+            print line
+    return results
+
+def write_pgf(group, test, plain_file, marker_file, papi_file=None, execute=False, script=None):
+    filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".tex")
+    fp = open(filename,'w')
+    fp.write("\documentclass{article}\n")
+    fp.write("\usepackage{pgfplots}\n")
+    fp.write("\\begin{document}\n")
+    fp.write("% cut from here\n")
+    fp.write("\\begin{tikzpicture}\n")
+    fp.write("\\begin{axis}[xlabel={Run}, ylabel={MFlops/s / MBytes/s},title={%s\_%s},legend pos=south east,xtick=data,width=.75\\textwidth]\n" % (group.replace("_","\_"),test.replace("_","\_"),))
+    fp.write("\\addplot+[red,mark=square*,mark options={draw=red, fill=red}] table {%s};\n" % (os.path.basename(plain_file),))
+    fp.write("\\addlegendentry{plain};\n")
+    fp.write("\\addplot+[blue,mark=diamond*,mark options={draw=blue, fill=blue}] table {%s};\n" % (os.path.basename(marker_file),))
+    fp.write("\\addlegendentry{marker};\n")
+    if papi and papi_file:
+        fp.write("\\addplot+[green,mark=triangle*,mark options={draw=green, fill=green}] table {%s};\n" % (os.path.basename(papi_file),))
+        fp.write("\\addlegendentry{papi};\n")
+    fp.write("\\end{axis}\n")
+    fp.write("\\end{tikzpicture}\n")
+    fp.write("% stop cutting here\n")
+    fp.write("\\end{document}\n")
+    fp.close()
+    if execute:
+        cmd = "cd %s && pdflatex %s && cd -" % (os.path.dirname(filename), os.path.basename(filename),)
+        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        p.wait()
+        if p.returncode != 0:
+            print p.stdout.read()
+        p.stdout.close()
+    if script:
+        script.write("pdflatex %s\n" % (os.path.basename(filename),))
+    return filename
+    
+def write_gnuplot(group, test, plain_file, marker_file, papi_file, execute=False, script=None):
+    filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".plot")
+    fp = open(filename,'w')
+    for i,color in enumerate(gnu_colors):
+        fp.write("set style line %d linetype 1 linecolor rgb '%s' lw 2 pt %s\n" % (i+1, color,gnu_marks[i]))
+    fp.write("set terminal jpeg\n")
+    fp.write("set title '%s_%s'\n" % (group, test,))
+    fp.write("set output '%s'\n" % (os.path.basename(os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".jpg")),))
+    fp.write("set xlabel 'Run'\n")
+    fp.write("set ylabel 'MFlops/s / MBytes/s'\n")
+    #fp.write("set xtics 1\n")
+    plot_string = "plot '%s' using 1:2 title 'plain' with linespoints ls 1, \\\n '%s' using 1:2 title 'marker' with linespoints ls 2" % (os.path.basename(plain_file), os.path.basename(marker_file),)
+    if papi and papi_file:
+        plot_string += ", \\\n '%s' using 1:2 title 'papi' with linespoints ls 3\n" % (os.path.basename(papi_file),)
+    fp.write(plot_string+"\n")
+    fp.close()
+    if execute:
+        cmd = "cd %s && gnuplot %s && cd -" % (os.path.dirname(filename), os.path.basename(filename),)
+        print cmd
+        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        p.wait()
+        if p.returncode != 0:
+            print p.stdout.read()
+        p.stdout.close()
+    if script:
+        script.write("gnuplot %s\n" % (os.path.basename(filename),))
+    return filename
+
+def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=False, script=None):
+    filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".bat")
+    agrname = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".agr")
+    pngname = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".png")
+    if execute or script:
+        plain_file = os.path.basename(plain_file)
+        marker_file = os.path.basename(marker_file)
+        if papi_file: papi_file = os.path.basename(papi_file)
+        pngname = os.path.basename(pngname)
+        agrname = os.path.basename(agrname)
+    cmd_options = "-autoscale xy -nxy %s -nxy %s "% (plain_file,marker_file,)
+    if papi and papi_file:
+        cmd_options += "-nxy %s " % (papi_file,)
+    out_options = "-hdevice PNG -printfile %s " % (pngname,)
+    out_options += "-saveall %s" % (agrname,)
+    fp = open(filename,'w')
+    fp.write("title \"%s_%s\"\n" % (group, test,))
+    fp.write("xaxis label \"Run\"\n")
+    fp.write("xaxis label char size 1.2\n")
+    fp.write("xaxis ticklabel char size 1.2\n")
+    fp.write("yaxis label \"MFlops/s / MBytes/s\"\n")
+    fp.write("yaxis label char size 1.2\n")
+    fp.write("yaxis ticklabel char size 1.2\n")
+    fp.write("legend 0.8,0.7\n")
+    fp.write("s0 legend \"plain\"\n")
+    fp.write("s0 symbol 2\n")
+    fp.write("s0 symbol size 1\n")
+    fp.write("s0 symbol color 2\n")
+    fp.write("s0 symbol pattern 1\n")
+    fp.write("s0 symbol fill color 2\n")
+    fp.write("s0 symbol fill pattern 1\n")
+    fp.write("s0 symbol linewidth 2\n")
+    fp.write("s0 symbol linestyle 1\n")
+    fp.write("s0 line type 1\n")
+    fp.write("s0 line color 2\n")
+    fp.write("s0 line linestyle 1\n")
+    fp.write("s0 line linewidth 2\n")
+    fp.write("s0 line pattern 1\n")
+    fp.write("s1 legend \"marker\"\n")
+    fp.write("s1 symbol 3\n")
+    fp.write("s1 symbol size 1\n")
+    fp.write("s1 symbol color 4\n")
+    fp.write("s1 symbol pattern 1\n")
+    fp.write("s1 symbol fill color 4\n")
+    fp.write("s1 symbol fill pattern 1\n")
+    fp.write("s1 symbol linewidth 2\n")
+    fp.write("s1 symbol linestyle 1\n")
+    fp.write("s1 line type 1\n")
+    fp.write("s1 line color 4\n")
+    fp.write("s1 line linestyle 1\n")
+    fp.write("s1 line linewidth 2\n")
+    fp.write("s1 line pattern 1\n")
+    if papi and papi_file:
+        fp.write("s2 legend \"papi\"\n")
+        fp.write("s2 symbol 4\n")
+        fp.write("s2 symbol size 1\n")
+        fp.write("s2 symbol color 3\n")
+        fp.write("s2 symbol pattern 1\n")
+        fp.write("s2 symbol fill color 3\n")
+        fp.write("s2 symbol fill pattern 1\n")
+        fp.write("s2 symbol linewidth 2\n")
+        fp.write("s2 symbol linestyle 1\n")
+        fp.write("s2 line type 1\n")
+        fp.write("s2 line color 3\n")
+        fp.write("s2 line linestyle 1\n")
+        fp.write("s2 line linewidth 2\n")
+        fp.write("s2 line pattern 1\n")
+    fp.close()
+    if execute:
+        cmd = "cd %s && gracebat %s -param %s %s && cd -" % (os.path.dirname(filename), cmd_options, os.path.basename(filename),out_options,)
+        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        p.wait()
+        if p.returncode != 0:
+            print p.stdout.read()
+        p.stdout.close()
+    if script:
+        script.write("gracebat %s -param %s %s\n" % (cmd_options, os.path.basename(filename),out_options,))
+    return filename
+
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "hs:", ["help", "sets=","script","scriptname=","wiki","only_wiki","pgf","gnuplot","grace","papi"])
+except getopt.GetoptError as err:
+    print str(err)
+    usage()
+    sys.exit(2)
+
+if len(opts) == 0:
+    usage()
+    sys.exit(1)
+
+for o, a in opts:
+    if o in ("-h","--help"):
+        usage()
+        sys.exit(0)
+    if o == "--wiki":
+        wiki = True
+    if o == "--only_wiki":
+        only_wiki = True
+    if o == "--papi":
+        papi = True
+    if o == "--pgf":
+        out_pgf = True
+    if o == "--gnuplot":
+        out_gnuplot = True
+    if o == "--grace":
+        out_grace = True
+    if o in ("-s","--sets"):
+        sets = a.split(",")
+    if o == "--script":
+        out_script = True
+    if o == "--scriptname":
+        scriptfilename = a
+
+if not os.path.exists(testlist):
+    print "Cannot find file %s containing list of testgroups" % (testlist,)
+    sys.exit(1)
+if not os.path.exists(testfolder):
+    print "Cannot find folder %s containing the testgroups" % (testfolder,)
+    sys.exit(1)
+
+test_set = {}
+plain_set = {}
+marker_set = {}
+papi_set = {}
+fp = open(testlist,'r')
+for line in fp.read().split("\n"):
+    if not line.strip() or line.startswith("#"): continue
+    if os.path.exists("%s/%s.txt" % (testfolder,line.strip(),)):
+        test_set[line.strip()] = {}
+        plain_set[line.strip()] = {}
+        marker_set[line.strip()] = {}
+        papi_set[line.strip()] = {}
+        testfp = open("%s/%s.txt" % (testfolder,line.strip(),),'r')
+        test = None
+        for i,testline in enumerate(testfp.read().split("\n")):
+            if test and not testline.strip(): test = None
+            if testline.startswith("REGEX_BENCH"):
+                test_set[line.strip()]["REGEX_BENCH"] = re.compile(" ".join(testline.split(" ")[1:]))
+            if testline.startswith("REGEX_PERF"):
+                test_set[line.strip()]["REGEX_PERF"] = re.compile(" ".join(testline.split(" ")[1:]))
+            if testline.startswith("REGEX_PAPI"):
+                test_set[line.strip()]["REGEX_PAPI"] = re.compile(" ".join(testline.split(" ")[1:]))
+            if testline.startswith("TEST"):
+                test = testline.split(" ")[1]
+                test_set[line.strip()][test] = {}
+                plain_set[line.strip()][test] = {}
+                marker_set[line.strip()][test] = {}
+                papi_set[line.strip()][test] = {}
+            if testline.startswith("RUNS") and test:
+                test_set[line.strip()][test]["RUNS"] = int(testline.split(" ")[1])
+            if testline.startswith("VARIANT") and test:
+                linelist = re.split("\s+",testline);
+                variant = linelist[1]
+                if not test_set[line.strip()][test].has_key("variants"):
+                    test_set[line.strip()][test]["variants"] = []
+                test_set[line.strip()][test][variant] = linelist[2]
+                test_set[line.strip()][test]["variants"].append(linelist[1])
+                plain_set[line.strip()][test][variant] = []
+                marker_set[line.strip()][test][variant] = []
+                papi_set[line.strip()][test][variant] = []
+        testfp.close()
+fp.close()
+
+
+if len(test_set.keys()) == 0:
+    print "Cannot find any group in %s" % (testlist)
+    sys.exit(1)
+
+if not os.path.exists(resultfolder):
+    os.mkdir(resultfolder)
+if not os.path.exists(os.path.join(resultfolder,hostname)):
+    os.mkdir(os.path.join(resultfolder,hostname))
+
+if not only_wiki:
+    scriptfile = os.path.join(os.path.join(resultfolder,hostname),scriptfilename)
+    script = open(scriptfile,'w')
+    script.write("#!/bin/bash\n")
+
+    for group in test_set.keys():
+        perfctr_string = "%s -c S0:0 -g %s -m " % (perfctr,group,)
+        for test in test_set[group].keys():
+            if test.startswith("REGEX"): continue
+            file_plain = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.dat")
+            raw_plain = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.raw")
+            file_marker = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.dat")
+            raw_marker = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.raw")
+            outfp_plain = open(file_plain,'w')
+            rawfp_plain = open(raw_plain,'w')
+            outfp_marker = open(file_marker,'w')
+            rawfp_marker = open(raw_marker,'w')
+            if papi:
+                file_papi = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_papi.dat")
+                raw_papi = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_papi.raw")
+                outfp_papi = open(file_papi,'w')
+                rawfp_papi = open(raw_papi,'w')
+            else:
+                file_papi = None
+                raw_papi = None
+            counter = 1
+            for size in test_set[group][test]["variants"]:
+                if size.startswith("RUNS"): continue
+                bench_options = "-t %s -i %s -g 1 -w N:%s:1" % (test, test_set[group][test][size], size,)
+                for i in range(0,test_set[group][test]["RUNS"]):
+                    # Run with plain likwid-bench
+                    p = subprocess.Popen(bench_plain+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+                    try:
+                        p.wait()
+                        stdout = p.stdout.read()
+                        p.stdout.close()
+                    except:
+                        sys.exit(1)
+                    for line in stdout.split("\n"):
+                        if p.returncode != 0: print line
+                        match = test_set[group]["REGEX_BENCH"].match(line)
+                        if match:
+                            plain_set[group][test][size].append(match.group(1))
+                            outfp_plain.write(str(counter)+" "+match.group(1)+"\n")
+                        rawfp_plain.write(line+"\n")
+                    # Run with papi instrumented likwid-bench
+                    if papi:
+                        os.environ["PAPI_BENCH"] = str(group)
+                        p = subprocess.Popen(bench_papi+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+                        try:
+                            p.wait()
+                            stdout = p.stdout.read()
+                            p.stdout.close()
+                        except:
+                            sys.exit(1)
+                        for line in stdout.split("\n"):
+                            if p.returncode != 0: print line
+                            match = test_set[group]["REGEX_PAPI"].match(line)
+                            if match:
+                                papi_set[group][test][size].append(match.group(1))
+                                outfp_papi.write(str(counter)+" "+match.group(1)+"\n")
+                            rawfp_papi.write(line+"\n")
+                    # Run with LIKWID instrumented likwid-bench and likwid-perfctr
+                    p = subprocess.Popen(perfctr_string+" "+bench_marker+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+                    stdout = ""
+                    try:
+                        p.wait()
+                        stdout = p.stdout.read()
+                        p.stdout.close()
+                    except:
+                        sys.exit(1)
+                    for line in stdout.split("\n"):
+                        if p.returncode != 0: print line
+                        match = test_set[group]["REGEX_PERF"].match(line)
+                        if match:
+                            marker_set[group][test][size].append(float(match.group(1)))
+                            outfp_marker.write(str(counter)+" "+str(float(match.group(1)))+"\n")
+                        rawfp_marker.write(line+"\n")
+                    counter += 1
+            outfp_plain.close()
+            rawfp_plain.close()
+            outfp_marker.close()
+            rawfp_marker.close()
+            if papi:
+                outfp_papi.close()
+                rawfp_papi.close()
+            if out_pgf: pgf_file = write_pgf(group, test, file_plain, file_marker, file_papi, script=script)
+            if out_gnuplot: plot_file = write_gnuplot(group, test, file_plain, file_marker, file_papi, script=script)
+            if out_grace: grace_file = write_grace(group, test, file_plain, file_marker, file_papi, script=script)
+
+
+    script.close()
+    os.chmod(scriptfile, stat.S_IRWXU)
+#if only_wiki:
+#    for group in test_set.keys():
+#        for test in test_set[group].keys():
+#            if test.startswith("REGEX"): continue
+#            filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.dat")
+#            for i,size in enumerate(test_set[group][test]["variants"]):
+#                start = i*test_set[group][test]["RUNS"]
+#                end = (i+1)*test_set[group][test]["RUNS"]
+#                runs = test_set[group][test]["RUNS"]
+#                print "Read file %s for size %s from %d to %d" % (filename,size, start, end,)
+#                plain_set[group][test][size] = get_values_from_file(filename, start, runs)
+#                if len(plain_set[group][test][size]) == 0: plain_set[group][test][size].append(0)
+#            filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.dat")
+#            for i,size in enumerate(test_set[group][test]["variants"]):
+#                start = i*test_set[group][test]["RUNS"]
+#                end = (i+1)*test_set[group][test]["RUNS"]
+#                runs = test_set[group][test]["RUNS"]
+#                print "Read file %s for size %s from %d to %d" % (filename,size, start, end,)
+#                marker_set[group][test][size] = get_values_from_file(filename, start, runs)
+#                if len(marker_set[group][test][size]) == 0: marker_set[group][test][size].append(0)
+
+
+if wiki or only_wiki:  # print the result page in Google Code wiki syntax on stdout
+    name, sockets, corespersocket, threadspercore = get_system_info()
+    groups = get_groups()
+    testable_groups = get_test_groups(groups)
+    #print groups
+    #print testable_groups
+    #if testable_groups.has_key("FLOPS_DP"): del testable_groups["FLOPS_DP"]
+
+    print "#summary Accuracy Tests for %s\n" % (name,)
+    print "= Hardware description ="
+    print "Sockets: %d<br>" % (sockets,)
+    print "Cores per socket: %d<br>" % (corespersocket,)
+    print "Threads per core: %d<br>" % (threadspercore,)
+    print "Total number of processing units: %d<br>" % (sockets * corespersocket * threadspercore)
+    print
+    print "= Available groups ="
+    print "Each architecture defines a different set of groups. Here all the groups available for the %s are listed:<br>" % (name,)
+    for grp in groups.keys():
+        print "%s: %s<br>" % (grp, groups[grp],)
+    print
+    print "= Available verification tests ="
+    print "Not all groups can be tested for accuracy. Here only the groups are listed that can be verified. Each group is followed by the low-level benchmarks that are performed for comparison.<br>"
+    #print testable_groups
+    for grp in testable_groups.keys():
+        print "%s: %s<br>" % (grp, ", ".join (testable_groups[grp]))
+    print
+    print "= Accuracy comparison ="
+    print "For each varification group, the tests are performed twice. Once in a plain manner without measuring but calculating the resulting values and once through an instumented code with LIKWID.<br>"
+    # Per group and test: variant/iteration table, picture link, then min/max/avg of plain vs. marker runs.
+
+    for grp in testable_groups.keys():
+        print "== Verification of Group %s ==" % (grp,)
+        for test in testable_groups[grp]:
+            #print grp, test, test_set[grp][test]
+            print "=== Verification of Group %s with Test %s ===" % (grp, test,)
+            print "|| *Stream size* || *Iterations* ||"
+            for variant in test_set[grp][test]["variants"]:
+                print "|| %s || %s ||" % (variant, test_set[grp][test][variant], )
+            print
+            print "Each data size is tested %d times, hence the first %d entries on the x-axis correspond to the %d runs for the first data size of %s and so on.<br>" % (test_set[grp][test]["RUNS"],test_set[grp][test]["RUNS"],test_set[grp][test]["RUNS"],test_set[grp][test]["variants"][0],)
+            print "%s/accuracy/%s/%s_%s.png" % (picture_base,hostname, grp, test,)
+            print
+            file_plain = os.path.join(os.path.join(resultfolder,hostname),grp+"_"+test+"_plain.dat")
+            file_marker = os.path.join(os.path.join(resultfolder,hostname),grp+"_"+test+"_marker.dat")
+            print "|| Variant || Plain (Min) || LIKWID (Min) || Plain (Max) || LIKWID (Max) || Plain (Avg) || LIKWID (Avg) ||"
+            for i, variant in enumerate(test_set[grp][test]["variants"]):
+                results_plain = get_values_from_file(file_plain, i*test_set[grp][test]["RUNS"], test_set[grp][test]["RUNS"])
+                results_marker = get_values_from_file(file_marker, i*test_set[grp][test]["RUNS"], test_set[grp][test]["RUNS"])
+                if results_plain == []: results_plain.append(0)  # min()/max() below raise on an empty list
+                if results_marker == []: results_marker.append(0)
+                print "|| %s || %d || %d || %d || %d || %d || %d ||" % (variant, min(results_plain), min(results_marker), max(results_plain), max(results_marker), int(statistics.mean(results_plain)), int(statistics.mean(results_marker)),)
+            print
+            print
diff --git a/test/accuracy/likwid-tester b/test/accuracy/likwid-tester
index 286b759..ea264ae 100755
--- a/test/accuracy/likwid-tester
+++ b/test/accuracy/likwid-tester
@@ -127,10 +127,12 @@ foreach my $test ( keys %$TESTS ) {
                 foreach ( 0 ... $runs ) {
                     print DATAFILE1 "$globalrun ";
                     print DATAFILE2 "$globalrun ";
+                    #print "$BENCH_PLAIN -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt";
                     system ("$BENCH_PLAIN -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt");
                     my $result = extract_result('plain',$TESTS->{$test}->{REGEX_BENCH},$TESTS->{$test}->{REGEX_PERF});
                     print DATAFILE1 "$result\n";
-                    system ("$PERFCTR  -c". $domain .":0 -m -g $test $BENCH_MARKER -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt");
+                    #print "$PERFCTR  -C E:". $domain .":0 -m -g $test $BENCH_MARKER -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt";
+                    system ("$PERFCTR  -C E:". $domain .":1 -m -g $test $BENCH_MARKER -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt");
                     $result = extract_result('marker',$TESTS->{$test}->{REGEX_BENCH},$TESTS->{$test}->{REGEX_PERF});
                     print DATAFILE2 "$result\n";
                     $globalrun++;
diff --git a/test/accuracy/statistics.py b/test/accuracy/statistics.py
new file mode 100755
index 0000000..15dfdf2
--- /dev/null
+++ b/test/accuracy/statistics.py
@@ -0,0 +1,643 @@
+##  Module statistics.py
+##
+##  Copyright (c) 2013 Steven D'Aprano <steve+python at pearwood.info>.
+##
+##  Licensed under the Apache License, Version 2.0 (the "License");
+##  you may not use this file except in compliance with the License.
+##  You may obtain a copy of the License at
+##
+##  http://www.apache.org/licenses/LICENSE-2.0
+##
+##  Unless required by applicable law or agreed to in writing, software
+##  distributed under the License is distributed on an "AS IS" BASIS,
+##  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+##  See the License for the specific language governing permissions and
+##  limitations under the License.
+
+
+"""
+Basic statistics module.
+
+This module provides functions for calculating statistics of data, including
+averages, variance, and standard deviation.
+
+Calculating averages
+--------------------
+
+==================  =============================================
+Function            Description
+==================  =============================================
+mean                Arithmetic mean (average) of data.
+median              Median (middle value) of data.
+median_low          Low median of data.
+median_high         High median of data.
+median_grouped      Median, or 50th percentile, of grouped data.
+mode                Mode (most common value) of data.
+==================  =============================================
+
+Calculate the arithmetic mean ("the average") of data:
+
+>>> mean([-1.0, 2.5, 3.25, 5.75])
+2.625
+
+
+Calculate the standard median of discrete data:
+
+>>> median([2, 3, 4, 5])
+3.5
+
+
+Calculate the median, or 50th percentile, of data grouped into class intervals
+centred on the data values provided. E.g. if your data points are rounded to
+the nearest whole number:
+
+>>> median_grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
+2.8333333333...
+
+This should be interpreted in this way: you have two data points in the class
+interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
+the class interval 3.5-4.5. The median of these data points is 2.8333...
+
+
+Calculating variability or spread
+---------------------------------
+
+==================  =============================================
+Function            Description
+==================  =============================================
+pvariance           Population variance of data.
+variance            Sample variance of data.
+pstdev              Population standard deviation of data.
+stdev               Sample standard deviation of data.
+==================  =============================================
+
+Calculate the standard deviation of sample data:
+
+>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75])  #doctest: +ELLIPSIS
+4.38961843444...
+
+If you have previously calculated the mean, you can pass it as the optional
+second argument to the four "spread" functions to avoid recalculating it:
+
+>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
+>>> mu = mean(data)
+>>> pvariance(data, mu)
+2.5
+
+
+Exceptions
+----------
+
+A single exception is defined: StatisticsError is a subclass of ValueError.
+
+"""
+
+__all__ = [ 'StatisticsError',
+            'pstdev', 'pvariance', 'stdev', 'variance',
+            'median',  'median_low', 'median_high', 'median_grouped',
+            'mean', 'mode',
+          ]
+
+
+import collections
+import math
+
+from fractions import Fraction
+from decimal import Decimal
+
+
+# === Exceptions ===
+
+class StatisticsError(ValueError):
+    """Raised when a statistic cannot be computed from the given data."""
+
+
+# === Private utilities ===
+
+def _sum(data, start=0):
+    """_sum(data [, start]) -> value
+
+    Return a high-precision sum of the given numeric data. If optional
+    argument ``start`` is given, it is added to the total. If ``data`` is
+    empty, ``start`` (defaulting to 0) is returned.
+
+    Examples
+    --------
+
+    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
+    11.0
+
+    Some sources of round-off error will be avoided:
+
+    >>> _sum([1e50, 1, -1e50] * 1000)  # Built-in sum returns zero.
+    1000.0
+
+    Fractions and Decimals are also supported:
+
+    >>> from fractions import Fraction as F
+    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
+    Fraction(63, 20)
+
+    >>> from decimal import Decimal as D
+    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
+    >>> _sum(data)
+    Decimal('0.6963')
+
+    Mixed types are currently treated as an error, except that int is
+    allowed.
+    """
+    # We fail as soon as we reach a value that is not an int or the type of
+    # the first value which is not an int. E.g. _sum([int, int, float, int])
+    # is okay, but _sum([int, int, float, Fraction]) is not.
+    allowed_types = set([int, type(start)])
+    n, d = _exact_ratio(start)
+    partials = {d: n}  # map {denominator: sum of numerators}
+    # Micro-optimizations.
+    exact_ratio = _exact_ratio
+    partials_get = partials.get
+    # Add numerators for each denominator.
+    for x in data:
+        _check_type(type(x), allowed_types)
+        n, d = exact_ratio(x)
+        partials[d] = partials_get(d, 0) + n
+    # Find the expected result type. If allowed_types has only one item, it
+    # will be int; if it has two, use the one which isn't int.
+    assert len(allowed_types) in (1, 2)
+    if len(allowed_types) == 1:
+        assert allowed_types.pop() is int
+        T = int
+    else:
+        T = (allowed_types - set([int])).pop()
+    if None in partials:
+        assert issubclass(T, (float, Decimal))
+        # math.isfinite() does not exist on Python 2; test INF/NAN directly.
+        assert math.isinf(partials[None]) or math.isnan(partials[None])
+        return T(partials[None])
+    total = Fraction()
+    for d, n in sorted(partials.items()):
+        total += Fraction(n, d)
+    if issubclass(T, int):
+        assert total.denominator == 1
+        return T(total.numerator)
+    if issubclass(T, Decimal):
+        return T(total.numerator)/total.denominator
+    return T(total)
+
+
+def _check_type(T, allowed):
+    """Admit type T into the set ``allowed`` or raise TypeError on a mix."""
+    if T in allowed: return
+    if len(allowed) != 1:
+        names = [t.__name__ for t in allowed] + [T.__name__]
+        raise TypeError("unsupported mixed types: %s" % ', '.join(names))
+    allowed.add(T)
+
+
+def _exact_ratio(x):
+    """Convert Real number x exactly to (numerator, denominator) pair.
+
+    >>> _exact_ratio(0.25)
+    (1, 4)
+
+    x is expected to be an int, Fraction, Decimal or float.
+    """
+    try:
+        try:
+            # int, Fraction
+            return (x.numerator, x.denominator)
+        except AttributeError:
+            # float
+            try:
+                return x.as_integer_ratio()
+            except AttributeError:
+                # Decimal
+                try:
+                    return _decimal_to_ratio(x)
+                except AttributeError:
+                    msg = "can't convert type '{}' to numerator/denominator"
+                    exc = TypeError(msg.format(type(x).__name__))
+                    exc.__cause__ = None
+                    raise exc
+    except (OverflowError, ValueError):
+        # INF or NAN (math.isfinite() is Python 3 only, so use isinf/isnan).
+        if __debug__:
+            # Decimal signalling NANs cannot be converted to float :-(
+            if isinstance(x, Decimal):
+                assert not x.is_finite()
+            else:
+                assert math.isinf(x) or math.isnan(x)
+        return (x, None)
+
+
+# FIXME This is faster than Fraction.from_decimal, but still too slow.
+def _decimal_to_ratio(d):
+    """Return the exact (numerator, denominator) integer pair for Decimal d.
+
+    >>> from decimal import Decimal
+    >>> _decimal_to_ratio(Decimal("2.6"))
+    (26, 10)
+
+    ValueError is raised for infinities and NANs. The pair is returned as
+    built from the digits and exponent; it is not reduced.
+    """
+    sign, digits, exp = d.as_tuple()
+    if exp in ('F', 'n', 'N'):  # INF, NAN, sNAN
+        assert not d.is_finite()
+        raise ValueError
+    # Weight each digit by its power of ten, least significant digit first.
+    magnitude = sum(digit * 10**place
+                    for place, digit in enumerate(reversed(digits)))
+    if sign:
+        magnitude = -magnitude
+    # A negative exponent becomes the denominator; otherwise scale up.
+    if exp < 0:
+        return (magnitude, 10**-exp)
+    return (magnitude * 10**exp, 1)
+
+
+def _counts(data):
+    """Return the (value, frequency) pairs that share the highest frequency.
+
+    An empty list is returned for empty data.
+    """
+    ranked = collections.Counter(iter(data)).most_common()
+    if not ranked:
+        return ranked
+    # most_common() sorts by falling frequency, so the first entry holds
+    # the maximum and every tied entry can be selected by comparing to it.
+    top = ranked[0][1]
+    return [pair for pair in ranked if pair[1] == top]
+
+
+# === Measures of central tendency (averages) ===
+
+def mean(data):
+    """Return the sample arithmetic mean of data.
+
+    >>> mean([1, 2, 3, 4, 4])
+    2.8
+
+    >>> from fractions import Fraction as F
+    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
+    Fraction(13, 21)
+
+    >>> from decimal import Decimal as D
+    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
+    Decimal('0.5625')
+
+    StatisticsError is raised when ``data`` is empty.
+    """
+    if iter(data) is data:
+        data = list(data)
+    count = len(data)
+    if count < 1:
+        raise StatisticsError('mean requires at least one data point')
+    # Pure-int input is summed from a float start so that the division
+    # below is a true division; any other input keeps its own type.
+    if all(type(value) is int for value in data):
+        total = _sum(data, 0.0)
+    else:
+        total = _sum(data)
+    return total/count
+
+def sort_and_convert(data):
+    """Return the items of ``data`` converted to float, in ascending order."""
+    as_floats = [float(item) for item in data]
+    return sorted(as_floats)
+
+# FIXME: investigate ways to calculate medians without sorting? Quickselect?
+def median(data):
+    """Return the median (middle value) of numeric data.
+
+    For an odd number of data points the middle data point is returned.
+    For an even number of data points the median is interpolated by
+    taking the average of the two middle values:
+
+    >>> median([1, 3, 5])
+    3
+    >>> median([1, 3, 5, 7])
+    4.0
+
+    """
+    ordered = sorted(data)
+    if not ordered:
+        raise StatisticsError("no median for empty data")
+    half, odd = divmod(len(ordered), 2)
+    if odd:
+        return ordered[half]
+    # Even count: average the two values that straddle the centre.
+    lower, upper = ordered[half - 1], ordered[half]
+    return (float(lower) + upper)/2
+
+
+def median_low(data):
+    """Return the low median of numeric data.
+
+    For an odd number of data points the middle value is returned.
+    For an even number the smaller of the two middle values is returned.
+
+    >>> median_low([1, 3, 5])
+    3
+    >>> median_low([1, 3, 5, 7])
+    3
+
+    """
+    ordered = sorted(data)
+    count = len(ordered)
+    if count == 0:
+        raise StatisticsError("no median for empty data")
+    # (count - 1)//2 is the middle index for an odd count and the lower
+    # of the two middle indices for an even count.
+    middle = (count - 1)//2
+    return ordered[middle]
+
+
+def median_high(data):
+    """Return the high median of data.
+
+    For an odd number of data points the middle value is returned.
+    For an even number the larger of the two middle values is returned.
+
+    >>> median_high([1, 3, 5])
+    3
+    >>> median_high([1, 3, 5, 7])
+    5
+
+    """
+    ordered = sorted(data)
+    if not ordered:
+        raise StatisticsError("no median for empty data")
+    # len//2 is the middle index (odd count) or the upper middle (even).
+    return ordered[len(ordered)//2]
+
+
+def median_grouped(data, interval=1):
+    """Return the 50th percentile (median) of grouped continuous data.
+
+    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
+    3.7
+    >>> median_grouped([52, 52, 53, 54])
+    52.5
+
+    This calculates the median as the 50th percentile, and should be
+    used when your data is continuous and grouped. In the above example,
+    the values 1, 2, 3, etc. actually represent the midpoint of classes
+    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
+    class 3.5-4.5, and interpolation is used to estimate it.
+
+    Optional argument ``interval`` represents the class interval, and
+    defaults to 1. Changing the class interval naturally will change the
+    interpolated 50th percentile value:
+
+    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
+    3.25
+    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
+    3.5
+
+    This function does not check whether the data points are at least
+    ``interval`` apart.
+    """
+    data = sorted(data)
+    n = len(data)
+    if n == 0:
+        raise StatisticsError("no median for empty data")
+    elif n == 1:
+        return data[0]
+    # Find the value at the midpoint. Remember this corresponds to the
+    # centre of the class interval.
+    x = data[n//2]
+    for obj in (x, interval):
+        if isinstance(obj, (str, bytes)):
+            raise TypeError('expected number but got %r' % obj)
+    try:
+        # Divide by 2.0, not 2: without true division an int ``interval``
+        # would be floored and the result would be off by half a class.
+        L = x - interval/2.0  # The lower limit of the median interval.
+    except TypeError:
+        # Mixed type. For now we just coerce to float.
+        L = float(x) - float(interval)/2
+    cf = data.index(x)  # Number of values below the median interval.
+    # FIXME The following line could be more efficient for big lists.
+    f = data.count(x)  # Number of data points in the median interval.
+    # n/2.0 keeps the half position when n is odd (no floor division).
+    return L + interval*(n/2.0 - cf)/f
+
+
+def mode(data):
+    """Return the most common data point from discrete or nominal data.
+
+    ``mode`` assumes discrete data, and returns a single value. This is the
+    standard treatment of the mode as commonly taught in schools:
+
+    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
+    3
+
+    This also works with nominal (non-numeric) data:
+
+    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
+    'red'
+
+    StatisticsError is raised for empty data and whenever there is not
+    exactly one most common value.
+    """
+    # _counts() yields only the value(s) tied for the highest frequency.
+    winners = _counts(data)
+    if not winners:
+        raise StatisticsError('no mode for empty data')
+    if len(winners) > 1:
+        raise StatisticsError(
+                'no unique mode; found %d equally common values'
+                % len(winners)
+                )
+    return winners[0][0]
+
+
+# === Measures of spread ===
+
+# See http://mathworld.wolfram.com/Variance.html
+#     http://mathworld.wolfram.com/SampleVariance.html
+#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+#
+# Under no circumstances use the so-called "computational formula for
+# variance", as that is only suitable for hand calculations with a small
+# amount of low-precision data. It has terrible numeric properties.
+#
+# See a comparison of three computational methods here:
+# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
+
+def _ss(data, c=None):
+    """Return sum of square deviations of sequence data.
+
+    When ``c`` is None the mean is calculated in a first pass and the
+    deviations from that mean in a second pass. Otherwise the deviations
+    are taken from ``c`` as given. Use the second case with care, as it can
+    lead to garbage results.
+    """
+    centre = mean(data) if c is None else c
+    deviations = [x - centre for x in data]
+    total = _sum(dev**2 for dev in deviations)
+    # Mathematically the deviations sum to zero; subtracting the square of
+    # whatever rounding error is left over compensates for it.
+    total -= _sum(deviations)**2/len(data)
+    assert not total < 0, 'negative sum of square deviations: %f' % total
+    return total
+
+
+def variance(data, xbar=None):
+    """Return the sample variance of data.
+
+    data should be an iterable of Real-valued numbers, with at least two
+    values. The optional argument xbar, if given, should be the mean of
+    the data. If it is missing or None, the mean is automatically calculated.
+
+    Use this function when your data is a sample from a population. To
+    calculate the variance from the entire population, see ``pvariance``.
+
+    Examples:
+
+    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
+    >>> variance(data)
+    1.3720238095238095
+
+    If you have already calculated the mean of your data, you can pass it as
+    the optional second argument ``xbar`` to avoid recalculating it:
+
+    >>> m = mean(data)
+    >>> variance(data, m)
+    1.3720238095238095
+
+    This function does not check that ``xbar`` is actually the mean of
+    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
+    impossible results.
+
+    Decimals and Fractions are supported:
+
+    >>> from decimal import Decimal as D
+    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
+    Decimal('31.01875')
+
+    >>> from fractions import Fraction as F
+    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
+    Fraction(67, 108)
+
+    """
+    # Materialise one-shot iterators: the data is measured and then re-read.
+    samples = list(data) if iter(data) is data else data
+    if len(samples) < 2:
+        raise StatisticsError('variance requires at least two data points')
+    # Sample variance: divide by one less than the number of samples.
+    squares = _ss(samples, xbar)
+    return squares/(len(samples) - 1)
+
+
+def pvariance(data, mu=None):
+    """Return the population variance of ``data``.
+
+    data should be an iterable of Real-valued numbers, with at least one
+    value. The optional argument mu, if given, should be the mean of
+    the data. If it is missing or None, the mean is automatically calculated.
+
+    Use this function to calculate the variance from the entire population.
+    To estimate the variance from a sample, the ``variance`` function is
+    usually a better choice.
+
+    Examples:
+
+    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
+    >>> pvariance(data)
+    1.25
+
+    If you have already calculated the mean of the data, you can pass it as
+    the optional second argument to avoid recalculating it:
+
+    >>> mu = mean(data)
+    >>> pvariance(data, mu)
+    1.25
+
+    This function does not check that ``mu`` is actually the mean of ``data``.
+    Giving arbitrary values for ``mu`` may lead to invalid or impossible
+    results.
+
+    Decimals and Fractions are supported:
+
+    >>> from decimal import Decimal as D
+    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
+    Decimal('24.815')
+
+    >>> from fractions import Fraction as F
+    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
+    Fraction(13, 72)
+
+    """
+    population = list(data) if iter(data) is data else data
+    size = len(population)
+    if size < 1:
+        raise StatisticsError('pvariance requires at least one data point')
+    # The whole population is given, so divide by its full size.
+    squares = _ss(population, mu)
+    return squares/size
+
+
+def stdev(data, xbar=None):
+    """Return the square root of the sample variance.
+
+    Arguments and other details are described under ``variance``.
+
+    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
+    1.0810874155219827
+
+    """
+    spread = variance(data, xbar)
+    # Prefer the value's own sqrt() method when it has one (Decimal does).
+    own_sqrt = getattr(spread, 'sqrt', None)
+    if own_sqrt is None: return math.sqrt(spread)
+    return own_sqrt()
+
+
+def pstdev(data, mu=None):
+    """Return the square root of the population variance.
+
+    Arguments and other details are described under ``pvariance``.
+
+    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
+    0.986893273527251
+
+    """
+    spread = pvariance(data, mu)
+    # Prefer the value's own sqrt() method when it has one (Decimal does).
+    own_sqrt = getattr(spread, 'sqrt', None)
+    if own_sqrt is None: return math.sqrt(spread)
+    return own_sqrt()
+
+def percentile(data, percentile):
+    """Return the element at fraction ``percentile`` of the sorted data.
+
+    ``percentile`` is a fraction of 1; any value above 1 is taken as a
+    percentage and divided by 100. The element at index
+    int(len(data)*percentile) is selected; an index of 0 is raised to 1,
+    and an index past the end is lowered to the last element.
+
+    A single data point is returned as it is. Empty data raises IndexError.
+    """
+    sorted_data = sorted(data)
+    if percentile > 1:
+        percentile /= 100.0
+    index = int(len(sorted_data)*percentile)
+    if index == 0:
+        index = 1
+    # Clamp to the last element: covers a full percentile and also a
+    # single data point, where the bump to 1 above would overshoot.
+    return sorted_data[min(index, len(sorted_data) - 1)]
+
+def percentile_10(data): return percentile(data, 0.1)
+def percentile_20(data): return percentile(data, 0.2)
+def percentile_25(data): return percentile(data, 0.25)
+def percentile_30(data): return percentile(data, 0.3)
+def percentile_40(data): return percentile(data, 0.4)
+def percentile_50(data): return percentile(data, 0.5)
+def percentile_60(data): return percentile(data, 0.6)
+def percentile_70(data): return percentile(data, 0.7)
+def percentile_75(data): return percentile(data, 0.75)
+def percentile_80(data): return percentile(data, 0.8)
+def percentile_90(data): return percentile(data, 0.9)
+
diff --git a/test/executable_tests/Makefile b/test/executable_tests/Makefile
new file mode 100644
index 0000000..08acc2a
--- /dev/null
+++ b/test/executable_tests/Makefile
@@ -0,0 +1,22 @@
+# Run the command line checks of tester.sh for every likwid executable.
+.PHONY: all topology pin perfctr memsweeper powermeter features bench genCfg setFreq
+all: topology pin perfctr memsweeper powermeter features bench genCfg setFreq
+
+topology:
+	./tester.sh likwid-topology
+pin:
+	./tester.sh likwid-pin
+perfctr:
+	./tester.sh likwid-perfctr
+memsweeper:
+	./tester.sh likwid-memsweeper
+powermeter:
+	./tester.sh likwid-powermeter
+features:
+	./tester.sh likwid-features
+bench:
+	./tester.sh likwid-bench
+genCfg:
+	./tester.sh likwid-genCfg
+setFreq:
+	./tester.sh likwid-setFreq
diff --git a/test/executable_tests/README b/test/executable_tests/README
new file mode 100644
index 0000000..99ab560
--- /dev/null
+++ b/test/executable_tests/README
@@ -0,0 +1,8 @@
+Simple commandline argument evaluation tool
+
+Usage: ./tester.sh <executable>
+
+For batch testing all executables simply type make
+
+All lines in the <executable>.txt file are executed and the output evaluated.
+Only simple checks are made using grep.
diff --git a/test/executable_tests/likwid-bench.txt b/test/executable_tests/likwid-bench.txt
new file mode 100644
index 0000000..474b160
--- /dev/null
+++ b/test/executable_tests/likwid-bench.txt
@@ -0,0 +1,29 @@
+| EXIT 0 | GREP Help message
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-bench
+-p | EXIT 0 | GREP Domain
+-a | EXIT 0 | GREP sum
+-i | EXIT 1 | GREP requires an argument
+-i 0 | EXIT 1 | GREP Iterations must be greater than 0
+-i 100 | EXIT 1 | GREP Number of Workgroups must be 1 or greater
+-l | EXIT 1 | GREP requires an argument
+-l sum | EXIT 0 | GREP Name: sum
+-l XXX | EXIT 0 | GREP Unknown test case XXX
+-t | EXIT 1 | GREP requires an argument
+-t sum | EXIT 1 | GREP Number of Workgroups must be 1 or greater
+-t XXX | EXIT 1 | GREP Number of Workgroups must be 1 or greater
+-g | EXIT 1 | GREP requires an argument
+-g 0 | EXIT 1 | GREP Number of Workgroups must be 1 or greater
+-g 1 | EXIT 1 | GREP workgroups requested but only 0 given on commandline
+-g X | EXIT 1 | GREP Number of Workgroups must be 1 or greater
+-w | EXIT 1 | GREP requires an argument
+-g 1 -w X | EXIT 1 | GREP You need to specify a test case first
+-t sum -g 1 -w X | EXIT 1 | GREP Error in parsing workgroup string
+-t sum -g 1 -w N:1 | EXIT 1 | GREP Cannot parse string
+-t XXX -g 1 -w N:1MB:1 | EXIT 1 | GREP You need to specify a test case first
+-g 1 -w N:100kB:1 | EXIT 1 | GREP You need to specify a test case first
+-i 100 -t sum -g 1 -w N:100kB:1 | EXIT 0 | GREP Number of Flops
+-i 100 -t sum -g 2 -w N:100kB:1 | EXIT 1 | GREP workgroups requested but only 1 given on commandline
+-i 100 -t sum -g 2 -w N:100kB:1 -w N:100kB:1 | EXIT 0 | GREP Number of Flops
+-i 100 -t sum -g 1 -w N:100kB:2:1 | EXIT 1 | GREP Error in parsing workgroup string
+-i 100 -t sum -g 1 -w N:100kB:2:1:2 | EXIT 0 | GREP Number of Flops
diff --git a/test/executable_tests/likwid-features.txt b/test/executable_tests/likwid-features.txt
new file mode 100644
index 0000000..ce95592
--- /dev/null
+++ b/test/executable_tests/likwid-features.txt
@@ -0,0 +1,9 @@
+| EXIT 0 | GREP Performance monitoring | GREP CPU core id
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-features
+-c | EXIT 1 | GREP option requires an argument
+-s | EXIT 1 | GREP option requires an argument
+-u | EXIT 1 | GREP option requires an argument
+-c 0 | EXIT 0 | GREP Performance monitoring | GREP CPU core id
+-s HW_PREFETCHER | EXIT 0 | GREP Performance monitoring | GREP CPU core id
+-u HW_PREFETCHER | EXIT 0 | GREP Performance monitoring | GREP CPU core id
diff --git a/test/executable_tests/likwid-genCfg.txt b/test/executable_tests/likwid-genCfg.txt
new file mode 100644
index 0000000..6369b70
--- /dev/null
+++ b/test/executable_tests/likwid-genCfg.txt
@@ -0,0 +1,5 @@
+| EXIT 1 | GREP Permission denied
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-genCfg
+-o | EXIT 1 | GREP option requires an argument
+-o /tmp/topo.txt | EXIT 0 | GREP CPU name
diff --git a/test/executable_tests/likwid-memsweeper.txt b/test/executable_tests/likwid-memsweeper.txt
new file mode 100644
index 0000000..6c4cd0e
--- /dev/null
+++ b/test/executable_tests/likwid-memsweeper.txt
@@ -0,0 +1,8 @@
+| EXIT 0 | GREP Sweeping domain
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-memsweeper
+-c | EXIT 1 | GREP option requires an argument
+-c - | EXIT 1 | GREP Cannot parse string
+-c -1 | EXIT 0 | GREP Sweeping domain
+-c 0 | EXIT 0 | GREP Sweeping domain
+-c 10 | EXIT 1 | GREP ERROR | GREP numa
diff --git a/test/executable_tests/likwid-perfctr.txt b/test/executable_tests/likwid-perfctr.txt
new file mode 100644
index 0000000..80ac60d
--- /dev/null
+++ b/test/executable_tests/likwid-perfctr.txt
@@ -0,0 +1,38 @@
+| EXIT 0 | GREP Help message
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-perfctr
+-i | EXIT 0 | GREP CPU family
+-V -c 0 hostname | EXIT 0 | GREP NOTICE
+-V | EXIT 1 | GREP You must specify at least one processor
+-g | EXIT 1 | GREP option requires an argument
+-g BRANCH -H | EXIT 0 | GREP Group BRANCH:
+-a | EXIT 0 | GREP Available groups
+-V -e | EXIT 0 | GREP This architecture
+-t 200ms | EXIT 1 | GREP You must specify at least one processor
+-c | EXIT 1 | GREP option requires an argument
+-c 0 | EXIT 1 | GREP You have to specify a program to measure as argument
+-t 200ms -c 0 | EXIT 1 | GREP Executable must be given on commandline
+-S | EXIT 1 | GREP option requires an argument
+-o | EXIT 1 | GREP option requires an argument
+-o /tmp/test | EXIT 1 | GREP Outputfile has no filetype suffix
+-o /tmp/test.txt | EXIT 1 | GREP You must specify at least one processor
+-S 1 | EXIT 1 | GREP You must specify at least one processor
+-S 1 -c 0 | EXIT 1 | GREP You have to specify a group or event set to measure using the -g option.
+-S 1 -C 0 | EXIT 1 | GREP You have to specify a group or event set to measure using the -g option.
+-S 1 -c 0 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP Branch
+-S 1 -C 0 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP Branch
+-S 1 -c 0,1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1 -c 0-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1 -c 0,1-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1 -C 0,1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1 -C 0-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1 -C 0,1-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1 -c E:N:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
+-S 1 -c E:N:2:1:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
+-S 1 -c M:scatter -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
+-S 1 -C E:N:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
+-S 1 -C E:N:2:1:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
+-S 1 -C M:scatter -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
+-c 0 -g BRANCH hostname | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
+-C 0 -g BRANCH hostname | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
+-C 0 -g BRANCH -m hostname | EXIT 1 | GREP  The marker result file could not be found
diff --git a/test/executable_tests/likwid-pin.txt b/test/executable_tests/likwid-pin.txt
new file mode 100644
index 0000000..801f79c
--- /dev/null
+++ b/test/executable_tests/likwid-pin.txt
@@ -0,0 +1,26 @@
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-pin
+-i | EXIT 1 | GREP Executable must be given on commandline
+-i hostname | EXIT 0 | GREP Set mem_policy to interleaved
+-S | EXIT 1 |GREP Executable must be given on commandline
+-S hostname | EXIT 0 | GREP Sweeping memory
+-c | EXIT 1 |GREP option requires an argument
+-p | EXIT 0 | GREP Domain | GREP Tag
+-c 0 | EXIT 1 | GREP Executable must be given on commandline
+-c 0 -p | EXIT 0 | GREP 0
+-c N:0 -p | EXIT 0 | GREP 0
+-c S0:0-1 -p | EXIT 0 | GREP 0,1
+-c N:0 at N:1 -p | EXIT 0 | GREP 0,1
+-c N:0 at N:1 at N:2 -p | EXIT 0 | GREP 0,1,2
+-c C0:1-0 -p | EXIT 1 | GREP Range End
+-c E:N:1 -p | EXIT 0 | GREP 0
+-c E:N:2 -p | EXIT 0 | LISTLEN , 2
+-c E:N:2:1:2 -p | EXIT 0 | LISTLEN , 2
+-c E:N:2:1:2 -d . -p | EXIT 0 | LISTLEN . 2
+-c M:scatter -p | EXIT 0
+-s | EXIT 1 | GREP option requires an argument
+-s 0x1 | EXIT 1 | GREP Executable must be given on commandline
+-s 0x1 hostname | EXIT 0 | GREP Main PID
+-q | EXIT 1 | GREP Executable must be given on commandline
+-q hostname | EXIT 1 | NGREP Main PID
+
diff --git a/test/executable_tests/likwid-powermeter.txt b/test/executable_tests/likwid-powermeter.txt
new file mode 100644
index 0000000..f733b06
--- /dev/null
+++ b/test/executable_tests/likwid-powermeter.txt
@@ -0,0 +1,14 @@
+| EXIT 0 | GREP Help message
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-powermeter
+-i | EXIT 0 | GREP Base clock | GREP Power
+-c | EXIT 1 | GREP option requires an argument | GREP Help message
+-s | EXIT 1 | GREP option requires an argument | GREP Help message
+-M | EXIT 1 | GREP option requires an argument | GREP Help message
+-s 1 | EXIT 0 | GREP consumed
+-c 0 | EXIT 1 | GREP Commandline option -c requires an executable if not used in combination with -s
+-p | EXIT 1 | GREP Commandline option -p requires an executable
+-c 0 -s 1 | EXIT 0 | GREP consumed | GREP Socket 0
+-p hostname | EXIT 0 | GREP Measuring group CLOCK
+-c 0 hostname | EXIT 0 | GREP consumed | GREP Socket 0
+-M 1 | EXIT 1 | GREP Either -s <seconds> or executable must be given on commandline
diff --git a/test/executable_tests/likwid-setFreq.txt b/test/executable_tests/likwid-setFreq.txt
new file mode 100644
index 0000000..56c495b
--- /dev/null
+++ b/test/executable_tests/likwid-setFreq.txt
@@ -0,0 +1,6 @@
+| EXIT 1 | GREP Usage
+0 | EXIT 1 | GREP Usage
+0 0 | EXIT 1 | GREP Frequency must be greater than 0
+0 -1 | EXIT 1 | GREP Frequency must be greater than 0
+-1 -1 | EXIT 1 | GREP not a valid CPU ID. Range from 0 to
+100 0 | EXIT 1 | GREP not a valid CPU ID. Range from 0 to
diff --git a/test/executable_tests/likwid-topology.txt b/test/executable_tests/likwid-topology.txt
new file mode 100644
index 0000000..810b1e9
--- /dev/null
+++ b/test/executable_tests/likwid-topology.txt
@@ -0,0 +1,11 @@
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-topology
+-c | EXIT 0 | GREP Cache line size
+-C | EXIT 0 | GREP CPU clock
+-g | EXIT 0 | GREP +--------
+-g -v | EXIT 0 | GREP likwid-topology
+-c -g | EXIT 0 | GREP +-------- | GREP Cache line size
+-c -g -C | EXIT 0 | GREP +-------- | GREP Cache line size | GREP CPU clock
+-o | EXIT 1
+-o /tmp/out | EXIT 1 | GREP filter suffix
+-o /tmp/out.txt | EXIT 0
diff --git a/test/executable_tests/tester.sh b/test/executable_tests/tester.sh
new file mode 100755
index 0000000..71342df
--- /dev/null
+++ b/test/executable_tests/tester.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+if [ $# -ne 1 ]; then
+    echo "You need to give application to test on commandline"
+    exit 1
+fi
+
+EXECPATH=../..
+EXEC=$1
+TMPFILE=/tmp/testout
+
+f_grep() {
+    ARG="$1"
+    if [ `grep "${ARG}" ${TMPFILE} | wc -l` == "0" ]; then
+        return 1
+    fi
+    return 0
+}
+
+# f_ngrep PATTERN: return 0 if no line of the captured program output in
+# ${TMPFILE} matches the grep pattern PATTERN, return 1 if some line matches.
+f_ngrep() {
+    # '--' keeps a PATTERN that begins with '-' from being parsed as an option.
+    if grep -q -- "$1" "${TMPFILE}"; then return 1; fi
+    return 0
+}
+
+f_listlen() {
+    LIST=$(cat ${TMPFILE})
+    DELIM=$(echo ${1} | cut -d ' ' -f 1)
+    COUNT=$(echo ${1} | cut -d ' ' -f 2)
+    CHARS=${LIST//[^${DELIM}]}
+    LENGTH=$(expr ${#CHARS} + 1)
+    if [ ${LENGTH} != "${COUNT}" ]; then
+        return 1
+    fi
+    return 0
+}
+
+# The test cases are read from ${EXEC}.txt; stop early if that file is missing.
+if [ ! -e "${EXEC}.txt" ]; then
+    echo "Cannot find testfile ${EXEC}.txt"; exit 1
+fi
+
+while read -r LINE || [[ -n $LINE ]]; do
+    if [ -z "${LINE}" ]; then continue; fi
+    if [[ "${LINE}" =~ \#.* ]]; then continue; fi
+
+    OPTIONS=$(echo "${LINE}" | cut -d '|' -f 1)
+    RESULTS=$(echo "${LINE}" | cut -d '|' -f 2-)
+    NUM_RESULTS="${RESULTS//[^|]}"
+    EXITCODE=$(${EXECPATH}/${EXEC} ${OPTIONS} 1>${TMPFILE} 2>&1  ; echo $?)
+    STATE=0
+    for ((i=1;i<=${#NUM_RESULTS}+1;i++)); do
+        RESULT=$(echo ${RESULTS} | cut -d '|' -f ${i})
+        RESULT_CMD=$(echo $RESULT | cut -d' ' -f1)
+        RESULT_OPTS=$(echo $RESULT | cut -d ' ' -f 2-)
+        if [ ${RESULT_CMD} == "EXIT" ]; then
+            if [ "${RESULT_OPTS}" != "$EXITCODE" ]; then
+                STATE=1
+            fi
+        elif [ ${RESULT_CMD} == "GREP" ]; then
+            f_grep "${RESULT_OPTS}"
+            STATE=$?
+        elif [ ${RESULT_CMD} == "NGREP" ]; then
+            f_ngrep "${RESULT_OPTS}"
+            STATE=$?
+        elif [ ${RESULT_CMD} == "LISTLEN" ]; then
+            f_listlen "${RESULT_OPTS}"
+            STATE=$?
+        fi
+    done
+    if [ $STATE -eq 0 ]; then
+        echo "SUCCESS : ${EXEC}" "${OPTIONS}"
+    else
+        echo "FAIL : ${EXEC}" "${OPTIONS}"
+    fi
+done < ${EXEC}.txt
+
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/likwid/likwid.git



More information about the Likwid-commit mailing list