[Likwid-commit] [likwid] 01/05: Imported Upstream version 4.1.2+dfsg1

Christoph Martin chrism at debian.org
Wed Nov 9 14:15:42 UTC 2016


This is an automated email from the git hooks/post-receive script.

chrism pushed a commit to branch master
in repository likwid.

commit 21a100b96c9387f33560c926b1b47ea576cf5fec
Author: Christoph Martin <martin at uni-mainz.de>
Date:   Wed Nov 9 10:32:09 2016 +0100

    Imported Upstream version 4.1.2+dfsg1
---
 CHANGELOG                                          |   6 +
 Makefile                                           |   2 +-
 README.md                                          |   1 +
 bench/includes/allocator.h                         |   2 +-
 bench/includes/allocator_types.h                   |   2 +-
 bench/includes/barrier.h                           |   2 +-
 bench/includes/barrier_types.h                     |   2 +-
 bench/includes/strUtil.h                           |   2 +-
 bench/includes/test_types.h                        |   2 +-
 bench/includes/threads.h                           |   2 +-
 bench/includes/threads_types.h                     |   2 +-
 bench/likwid-bench.c                               |   2 +-
 bench/src/allocator.c                              |   2 +-
 bench/src/barrier.c                                |   2 +-
 bench/src/bench.c                                  |   2 +-
 bench/src/strUtil.c                                |   2 +-
 bench/src/threads.c                                |   2 +-
 config.mk                                          |   2 +-
 doc/applications/likwid-bench.md                   |   4 +
 doc/applications/likwid-features.md                |  44 +
 doc/applications/likwid-mpirun.md                  |   4 +
 doc/applications/likwid-perfctr.md                 |   8 +
 doc/applications/likwid-perfscope.md               |  19 +-
 doc/applications/likwid-pin.md                     |   2 +-
 doc/archs/broadwell.md                             | 117 ++-
 doc/archs/broadwelld.md                            | 700 +++++++++++++++
 doc/archs/{haswellep.md => broadwellep.md}         | 620 +++++++------
 doc/archs/haswell.md                               |  98 +++
 doc/archs/haswellep.md                             |  16 +-
 doc/archs/ivybridge.md                             |  99 +++
 doc/archs/sandybridge.md                           | 100 ++-
 doc/archs/{haswell.md => skylake.md}               | 109 ++-
 doc/likwid-doxygen.md                              |  32 +-
 doc/lua-doxygen.md                                 | 974 +++++++++++++++++++--
 examples/C-likwidAPI.c                             |   2 +-
 examples/C-markerAPI.c                             |   2 +-
 examples/F-markerAPI.F90                           |   2 +-
 examples/Lua-likwidAPI.lua                         |   2 +-
 ext/hwloc/include/private/autogen/config.h         |   4 +-
 groups/broadwell/DATA.txt                          |   6 +-
 groups/broadwellD/DATA.txt                         |   6 +-
 groups/broadwellEP/DATA.txt                        |   6 +-
 groups/broadwellEP/FALSE_SHARE.txt                 |   1 +
 groups/goldmont/BRANCH.txt                         |  31 +
 .../{silvermont/ENERGY.txt => goldmont/CLOCK.txt}  |   8 +-
 groups/{broadwell => goldmont}/DATA.txt            |   6 +-
 groups/{silvermont => goldmont}/ENERGY.txt         |   8 +-
 groups/goldmont/ICACHE.txt                         |  25 +
 groups/goldmont/L2CACHE.txt                        |  34 +
 groups/goldmont/TLB_DATA.txt                       |  27 +
 groups/goldmont/TLB_INSTR.txt                      |  27 +
 groups/ivybridgeEP/UNCORECLOCK.txt                 |   2 +-
 groups/silvermont/ENERGY.txt                       |   2 +-
 perl/set_license.pl                                |   5 +-
 src/access-daemon/Makefile                         |   2 +-
 src/access-daemon/accessDaemon.c                   |   2 +-
 src/access-daemon/setFreq.c                        |   2 +-
 src/access.c                                       |   2 +-
 src/access_client.c                                |  54 ++
 src/access_x86.c                                   |  29 +
 src/access_x86_msr.c                               |   2 +-
 src/access_x86_pci.c                               |   4 +-
 src/affinity.c                                     |   2 +-
 src/applications/likwid-agent.lua                  |   2 +-
 src/applications/likwid-features.lua               |  63 +-
 src/applications/likwid-genTopoCfg.lua             |  37 +-
 src/applications/likwid-memsweeper.lua             |  37 +-
 src/applications/likwid-mpirun.lua                 | 444 ++++++----
 src/applications/likwid-perfctr.lua                | 287 ++++--
 src/applications/likwid-perfscope.lua              |  82 +-
 src/applications/likwid-pin.lua                    | 130 +--
 src/applications/likwid-powermeter.lua             | 136 +--
 src/applications/likwid-setFrequencies.lua         |  87 +-
 src/applications/likwid-topology.lua               |  54 +-
 src/applications/likwid.lua                        |  13 +-
 src/bitUtil.c                                      |   2 +-
 src/calculator.c                                   |  31 +-
 src/calculator_stack.c                             |  27 +-
 src/configuration.c                                |   2 +-
 src/cpuFeatures.c                                  |  56 +-
 src/cpustring.c                                    |  10 +-
 src/hashTable.c                                    |   2 +-
 src/includes/access.h                              |   2 +-
 src/includes/access_client.h                       |  29 +
 src/includes/access_client_types.h                 |   2 +-
 src/includes/access_x86.h                          |  29 +
 src/includes/access_x86_msr.h                      |  29 +
 src/includes/access_x86_pci.h                      |  29 +
 src/includes/affinity.h                            |   2 +-
 src/includes/bitUtil.h                             |   2 +-
 src/includes/calculator.h                          |  35 +-
 src/includes/calculator_stack.h                    |  32 +-
 src/includes/configuration.h                       |   2 +-
 src/includes/cpuFeatures.h                         |   2 +-
 src/includes/cpuFeatures_types.h                   |   2 +-
 src/includes/cpuid.h                               |   4 +-
 src/includes/error.h                               |   2 +-
 src/includes/hashTable.h                           |   2 +-
 src/includes/libperfctr_types.h                    |   2 +-
 src/includes/likwid.h                              |  11 +-
 src/includes/lock.h                                |   2 +-
 src/includes/memsweep.h                            |   2 +-
 src/includes/numa.h                                |   2 +-
 src/includes/numa_hwloc.h                          |   2 +-
 src/includes/numa_proc.h                           |   2 +-
 src/includes/pci_hwloc.h                           |   2 +-
 src/includes/pci_proc.h                            |   2 +-
 src/includes/pci_types.h                           |   2 +-
 src/includes/perfgroup.h                           |   4 +-
 src/includes/perfmon.h                             |   2 +-
 src/includes/perfmon_atom.h                        |   2 +-
 src/includes/perfmon_atom_events.txt               |   2 +-
 src/includes/perfmon_broadwell.h                   | 176 ++--
 src/includes/perfmon_broadwellEP_counters.h        |   7 +-
 src/includes/perfmon_broadwellEP_events.txt        |   8 +-
 src/includes/perfmon_broadwell_counters.h          |   2 +-
 src/includes/perfmon_broadwell_events.txt          |   2 +-
 src/includes/perfmon_broadwelld_counters.h         |   4 +-
 src/includes/perfmon_broadwelld_events.txt         |   2 +-
 src/includes/perfmon_core2.h                       |   4 +-
 src/includes/perfmon_core2_counters.h              |   2 +-
 src/includes/perfmon_core2_events.txt              |   2 +-
 .../{perfmon_skylake.h => perfmon_goldmont.h}      | 261 +-----
 src/includes/perfmon_goldmont_counters.h           |  65 ++
 src/includes/perfmon_goldmont_events.txt           | 211 +++++
 src/includes/perfmon_haswell.h                     |  54 +-
 src/includes/perfmon_haswellEP_counters.h          |   6 +-
 src/includes/perfmon_haswellEP_events.txt          |   2 +-
 src/includes/perfmon_haswell_counters.h            |   2 +-
 src/includes/perfmon_haswell_events.txt            |   2 +-
 src/includes/perfmon_interlagos.h                  |   4 +-
 src/includes/perfmon_interlagos_counters.h         |   2 +-
 src/includes/perfmon_interlagos_events.txt         |   2 +-
 src/includes/perfmon_ivybridge.h                   | 246 +++---
 src/includes/perfmon_ivybridgeEP_counters.h        |   2 +-
 src/includes/perfmon_ivybridgeEP_events.txt        |   8 +-
 src/includes/perfmon_ivybridge_counters.h          |   2 +-
 src/includes/perfmon_ivybridge_events.txt          |   5 +-
 src/includes/perfmon_k10.h                         |   4 +-
 src/includes/perfmon_k10_counters.h                |   2 +-
 src/includes/perfmon_k10_events.txt                |   2 +-
 src/includes/perfmon_k8.h                          |   2 +-
 src/includes/perfmon_k8_events.txt                 |   2 +-
 src/includes/perfmon_kabini.h                      |   4 +-
 src/includes/perfmon_kabini_counters.h             |   2 +-
 src/includes/perfmon_kabini_events.txt             |   2 +-
 src/includes/perfmon_nehalem.h                     | 106 ++-
 src/includes/perfmon_nehalemEX.h                   |  27 +-
 src/includes/perfmon_nehalemEX_counters.h          |   2 +-
 src/includes/perfmon_nehalemEX_events.txt          |   2 +-
 src/includes/perfmon_nehalemEX_westmereEX_common.h |  30 +
 src/includes/perfmon_nehalem_counters.h            |   2 +-
 src/includes/perfmon_nehalem_events.txt            |   2 +-
 src/includes/perfmon_p6_events.txt                 |   2 +-
 src/includes/perfmon_perf.h                        |   4 +-
 src/includes/perfmon_phi.h                         |   4 +-
 src/includes/perfmon_phi_counters.h                |   2 +-
 src/includes/perfmon_phi_events.txt                |   2 +-
 src/includes/perfmon_pm.h                          |   4 +-
 src/includes/perfmon_pm_counters.h                 |   2 +-
 src/includes/perfmon_pm_events.txt                 |   2 +-
 src/includes/perfmon_sandybridge.h                 | 122 ++-
 src/includes/perfmon_sandybridgeEP_counters.h      |   6 +-
 src/includes/perfmon_sandybridgeEP_events.txt      |  11 +-
 src/includes/perfmon_sandybridge_counters.h        |   2 +-
 src/includes/perfmon_sandybridge_events.txt        |   2 +-
 src/includes/perfmon_silvermont.h                  |   9 +-
 src/includes/perfmon_silvermont_counters.h         |   2 +-
 src/includes/perfmon_silvermont_events.txt         |   2 +-
 src/includes/perfmon_skylake.h                     | 101 ++-
 src/includes/perfmon_skylake_counters.h            |  38 +-
 src/includes/perfmon_skylake_events.txt            |   2 +-
 src/includes/perfmon_types.h                       |   2 +-
 src/includes/perfmon_westmere.h                    |   2 +-
 src/includes/perfmon_westmereEX.h                  |  31 +-
 src/includes/perfmon_westmereEX_counters.h         |   2 +-
 src/includes/perfmon_westmereEX_events.txt         |   2 +-
 src/includes/perfmon_westmere_events.txt           |   2 +-
 src/includes/power.h                               |   2 +-
 src/includes/power_types.h                         |   2 +-
 src/includes/registers.h                           |   2 +-
 src/includes/registers_types.h                     |   8 +-
 src/includes/textcolor.h                           |   2 +-
 src/includes/thermal.h                             |   2 +-
 src/includes/thermal_types.h                       |   2 +-
 src/includes/timer.h                               |   2 +-
 src/includes/timer_types.h                         |   2 +-
 src/includes/tlb-info.h                            |   2 +-
 src/includes/topology.h                            |   2 +-
 src/includes/topology_cpuid.h                      |   2 +-
 src/includes/topology_hwloc.h                      |   2 +-
 src/includes/topology_proc.h                       |   2 +-
 src/includes/topology_types.h                      |   2 +-
 src/includes/tree.h                                |   2 +-
 src/includes/tree_types.h                          |   2 +-
 src/includes/types.h                               |   2 +-
 src/libperfctr.c                                   |   2 +-
 src/likwid.f90                                     |   2 +-
 src/likwid_f90_interface.c                         |   2 +-
 src/luawid.c                                       |  30 +-
 src/memsweep.c                                     |  16 +-
 src/numa.c                                         |   2 +-
 src/numa_hwloc.c                                   |   2 +-
 src/numa_proc.c                                    |   2 +-
 src/pci_hwloc.c                                    |   2 +-
 src/pci_proc.c                                     |   2 +-
 src/perfgroup.c                                    |   2 +-
 src/perfmon.c                                      |  68 +-
 src/perfmon_perf.c                                 |   2 +-
 src/power.c                                        |   4 +-
 src/pthread-overload/Makefile                      |   2 +-
 src/pthread-overload/pthread-overload.c            |   2 +-
 src/thermal.c                                      |   2 +-
 src/timer.c                                        |   2 +-
 src/topology.c                                     |   3 +-
 src/topology_cpuid.c                               |   2 +-
 src/topology_hwloc.c                               |  89 +-
 src/topology_proc.c                                |  13 +-
 src/tree.c                                         |   2 +-
 test/MPI_pin_test.c                                |   4 +-
 test/accuracy/TESTS/FLOPS_AVX.txt                  |   2 +-
 test/accuracy/TESTS/FLOPS_DP.txt                   |   2 +-
 test/accuracy/TESTS/FLOPS_SP.txt                   |   2 +-
 test/accuracy/likwid-accuracy.py                   |   4 +-
 224 files changed, 5081 insertions(+), 1900 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 6c84df0..a1cf597 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,9 @@
+# Changelog 4.1.1
+- Fix for Uncore handling for EP/EN/EX systems
+- Minor fix for Uncore handling for Intel desktop systems
+- Fix in generic readCounters function
+- Support for Intel Goldmont (untested)
+- Fixes for likwid-mpirun
 
 
 # Changelog 4.1.0
diff --git a/Makefile b/Makefile
index e1f959b..43e9fac 100644
--- a/Makefile
+++ b/Makefile
@@ -541,7 +541,7 @@ local: $(L_APPS) likwid.lua
 	@echo "===> Setting Lua scripts to run from current directory"
 	@PWD=$(shell pwd)
 	@for APP in $(L_APPS); do \
-		sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s+$(PREFIX)/bin/likwid-lua+$(PWD)/ext/lua/lua+" -e "s+$(PREFIX)/share/lua/?.lua+$(PWD)/?.lua+" $$APP; \
+		sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s+$(PREFIX)/bin/likwid-lua+$(PWD)/ext/lua/lua+" -e "s+$(PREFIX)/share/lua/?.lua+$(PWD)/?.lua+" -e "s+$(PREFIX)/bin/likwid-pin+$(PWD)/likwid-pin+" -e "s+$(PREFIX)/bin/likwid-perfctr+$(PWD)/likwid-perfctr+" $$APP; \
 		chmod +x $$APP; \
 	done
 	@sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s+$(PREFIX)/lib+$(PWD)+g" -e "s+$(PREFIX)/share/likwid/perfgroups+$(PWD)/groups+g" likwid.lua;
diff --git a/README.md b/README.md
index 838883c..6626763 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ It consists of:
 - likwid-powermeter: read out RAPL Energy information and get info about Turbo mode steps
 - likwid-pin: pin your threaded application (pthread, Intel and gcc OpenMP to dedicated processors)
 - likwid-bench: Micro benchmarking platform
+- likwid-features: Print and manipulate cpu features like hardware prefetchers
 - likwid-genTopoCfg: Dumps topology information to a file
 - likwid-mpirun: Wrapper to start MPI and Hybrid MPI/OpenMP applications (Supports Intel MPI, OpenMPI and MPICH)
 - likwid-perfscope: Frontend to the timeline mode of likwid-perfctr, plots live graphs of performance metrics using gnuplot
diff --git a/bench/includes/allocator.h b/bench/includes/allocator.h
index f7eae06..76df396 100644
--- a/bench/includes/allocator.h
+++ b/bench/includes/allocator.h
@@ -6,7 +6,7 @@
  *      Description:  Header File allocator Module. 
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  none
diff --git a/bench/includes/allocator_types.h b/bench/includes/allocator_types.h
index 43ad3c0..c73a125 100644
--- a/bench/includes/allocator_types.h
+++ b/bench/includes/allocator_types.h
@@ -6,7 +6,7 @@
  *      Description:  Header File types of allocator Module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  none
diff --git a/bench/includes/barrier.h b/bench/includes/barrier.h
index 6427c4a..41abafa 100644
--- a/bench/includes/barrier.h
+++ b/bench/includes/barrier.h
@@ -6,7 +6,7 @@
  *      Description:  Header File barrier Module
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/bench/includes/barrier_types.h b/bench/includes/barrier_types.h
index 9fc6e30..2434299 100644
--- a/bench/includes/barrier_types.h
+++ b/bench/includes/barrier_types.h
@@ -6,7 +6,7 @@
  *      Description:  Type Definitions for barrier Module
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/bench/includes/strUtil.h b/bench/includes/strUtil.h
index a16790c..4b02ea8 100644
--- a/bench/includes/strUtil.h
+++ b/bench/includes/strUtil.h
@@ -5,7 +5,7 @@
  *      Description:  Some sting functions
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/bench/includes/test_types.h b/bench/includes/test_types.h
index 18627fc..9d4da1b 100644
--- a/bench/includes/test_types.h
+++ b/bench/includes/test_types.h
@@ -6,7 +6,7 @@
  *      Description:  Type definitions for benchmarking framework
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/bench/includes/threads.h b/bench/includes/threads.h
index d92bbc9..7693be5 100644
--- a/bench/includes/threads.h
+++ b/bench/includes/threads.h
@@ -6,7 +6,7 @@
  *      Description:  Header file of pthread interface module
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/bench/includes/threads_types.h b/bench/includes/threads_types.h
index 68f0af3..aa51ca1 100644
--- a/bench/includes/threads_types.h
+++ b/bench/includes/threads_types.h
@@ -6,7 +6,7 @@
  *      Description:  Types file for threads module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/bench/likwid-bench.c b/bench/likwid-bench.c
index 02d0ced..37d40e6 100644
--- a/bench/likwid-bench.c
+++ b/bench/likwid-bench.c
@@ -6,7 +6,7 @@
  *      Description:  A flexible and extensible benchmarking toolbox
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/bench/src/allocator.c b/bench/src/allocator.c
index ea0be48..290a6b1 100644
--- a/bench/src/allocator.c
+++ b/bench/src/allocator.c
@@ -6,7 +6,7 @@
  *      Description:  Implementation of allocator module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/bench/src/barrier.c b/bench/src/barrier.c
index 4b0e344..b536ff3 100644
--- a/bench/src/barrier.c
+++ b/bench/src/barrier.c
@@ -6,7 +6,7 @@
  *      Description:  Implementation of threaded spin loop barrier
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/bench/src/bench.c b/bench/src/bench.c
index e1e1a97..3cbfb54 100644
--- a/bench/src/bench.c
+++ b/bench/src/bench.c
@@ -6,7 +6,7 @@
  *      Description:  Benchmarking framework for likwid-bench
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *               Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/bench/src/strUtil.c b/bench/src/strUtil.c
index 8a4c429..93d4630 100644
--- a/bench/src/strUtil.c
+++ b/bench/src/strUtil.c
@@ -6,7 +6,7 @@
  *      Description:  Utility string routines building upon bstrlib
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com.
  *      Project:  likwid
diff --git a/bench/src/threads.c b/bench/src/threads.c
index 70a90ec..df506e9 100644
--- a/bench/src/threads.c
+++ b/bench/src/threads.c
@@ -6,7 +6,7 @@
  *      Description:  High level interface to pthreads
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/config.mk b/config.mk
index 46fbe78..9c5d7cd 100644
--- a/config.mk
+++ b/config.mk
@@ -77,7 +77,7 @@ TOPO_FILE_PATH = /etc/likwid_topo.cfg
 # Versioning Information
 VERSION = 4
 RELEASE = 1
-DATE    = 19.05.2016
+DATE    = 08.08.2016
 
 RPATHS = -Wl,-rpath=$(INSTALLED_LIBPREFIX)
 LIBLIKWIDPIN = $(abspath $(INSTALLED_PREFIX)/lib/liblikwidpin.so.$(VERSION).$(RELEASE))
diff --git a/doc/applications/likwid-bench.md b/doc/applications/likwid-bench.md
index fc642e1..1d371ee 100644
--- a/doc/applications/likwid-bench.md
+++ b/doc/applications/likwid-bench.md
@@ -24,6 +24,10 @@
   <TD>List all available thread affinity domains</TD>
 </TR>
 <TR>
+  <TD>-i <iters></TD>
+  <TD>Use <iters> iterations of the benchmark kernel</TD>
+</TR>
+<TR>
   <TD>-d <delim></TD>
   <TD>Use <delim> instead of ',' for the output of -p</TD>
 </TR>
diff --git a/doc/applications/likwid-features.md b/doc/applications/likwid-features.md
new file mode 100644
index 0000000..14ed37e
--- /dev/null
+++ b/doc/applications/likwid-features.md
@@ -0,0 +1,44 @@
+/*! \page likwid-features <CODE>likwid-features</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-features</CODE> is a command line application to print and change some features of the CPU. Most of the features cannot be changed during runtime, but the prefetchers <CODE>HW_PREFETCHER</CODE>, <CODE>CL_PREFETCHER</CODE>, <CODE>DCU_PREFETCHER</CODE>, <CODE>IP_PREFETCHER</CODE> can be altered.<BR>
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information</TD>
+</TR>
+<TR>
+  <TD>-a, --all</TD>
+  <TD>List all features</TD>
+</TR>
+<TR>
+  <TD>-c, --cpus <arg></TD>
+  <TD>Define the CPUs that should be modified. For information about the syntax see \ref CPU_expressions on the \ref likwid-pin page</TD>
+</TR>
+<TR>
+  <TD>-l, --list</TD>
+  <TD>List all features with current state for the CPUs defined at -c/--cpus</TD>
+</TR>
+<TR>
+  <TD>-e, --enable <list></TD>
+  <TD>Comma-separated list of features that should be enabled</TD>
+</TR>
+<TR>
+  <TD>-d, --disable <list></TD>
+  <TD>Comma-separated list of features that should be disabled</TD>
+</TR>
+</TABLE>
+
+
+
+*/
diff --git a/doc/applications/likwid-mpirun.md b/doc/applications/likwid-mpirun.md
index aee12d6..86f938c 100644
--- a/doc/applications/likwid-mpirun.md
+++ b/doc/applications/likwid-mpirun.md
@@ -62,6 +62,10 @@ A tool to start and monitor MPI applications with LIKWID. It can be used as supp
   <TD>-O</TD>
   <TD>Print results in CSV format (conform to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>)</TD>
 </TR>
+<TR>
+  <TD>-f</TD>
+  <TD>Configure events even if the counter is already in use.</TD>
+</TR>
 </TABLE>
 
 <H1>Examples</H1>
diff --git a/doc/applications/likwid-perfctr.md b/doc/applications/likwid-perfctr.md
index 9efc789..38ea6fb 100644
--- a/doc/applications/likwid-perfctr.md
+++ b/doc/applications/likwid-perfctr.md
@@ -103,9 +103,17 @@ custom event sets. The \ref Marker_API can measure mulitple named regions and th
   <TD>Print output in CSV format (conform to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>). The output contains some markers that help to parse the output.</TD>
 </TR>
 <TR>
+  <TD>-f, --force</TD>
+  <TD>Configure events even if the counter registers are already in use.</TD>
+</TR>
+<TR>
   <TD>-s, --skip <arg></TD>
   <TD>'arg' must be a bitmask in hex. Threads with the ID equal to a set bit in bitmask will be skipped during pinning<BR>Example: 0x1 = Thread 0 is skipped.</TD>
 </TR>
+<TR>
+  <TD>--stats</TD>
+  <TD>Always print the statistics table.</TD>
+</TR>
 </TABLE>
 
 <H1>Examples</H1>
diff --git a/doc/applications/likwid-perfscope.md b/doc/applications/likwid-perfscope.md
index 71c8984..5e05727 100644
--- a/doc/applications/likwid-perfscope.md
+++ b/doc/applications/likwid-perfscope.md
@@ -44,7 +44,7 @@ to create on-the-fly pictures with the current measurements. It uses the <A HREF
 </TR>
 <TR>
   <TD>-g, --group <arg></TD>
-  <TD>Specify a predefined plot with optional changes or an eventset with plot configuration. See \ref plot_configuration for details.</TD>
+  <TD>Specify a predefined plot with optional changes or an eventset.</TD>
 </TR>
 <TR>
   <TD>-r, --range <arg></TD>
@@ -56,6 +56,7 @@ to create on-the-fly pictures with the current measurements. It uses the <A HREF
 </TR>
 </TABLE>
 
+<!---
 \anchor plot_configuration
 <H1>Plot configurations</H1>
 <CODE>likwid-perfscope</CODE> extends the format of the eventset option of \ref likwid-perfctr to make it more conveniet for the users. It accepts either a plot configuration of interesting metrics which are embedded into <CODE>likwid-perfscope</CODE> or a custom eventset suitable for \ref likwid-perfctr extended by the plot configuration. A plot configuration can be set with key=value pairs separated by ':' and has to contain at least a definition of a formula for plotting. If specifyed [...]
@@ -87,20 +88,18 @@ to create on-the-fly pictures with the current measurements. It uses the <A HREF
   <TD>Use <string> as label for the right y-axis. If <id-string> is given, the formula with id is associated with the y2-axis. If used with predefined plot configurations, be aware that the formula 1 is part of the plot configuration. If no id is given, the y2-axis is associated with the last given formula. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
 </TR>
 </TABLE>
+-->
 
 <H1>Examples</H1>
 <UL>
-<LI><CODE>likwid-perfscope -g L3_BAND -C 0-2 -t 1s ./a.out</CODE><BR>
-Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2 and use the predefined plot configuration <CODE>L3_BAND</CODE> The plot is updated ever second.
+<LI><CODE>likwid-perfscope -g L3 -C 0-2 -t 1s ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2 and use the predefined plot configuration <CODE>L3</CODE>. The plot is updated every second.
 </LI>
-<LI><CODE>likwid-perfscope -g L3_BAND:TITLE="My Title" -C S0:1 -t 500ms ./a.out</CODE><BR>
-Pin the executable <CODE>./a.out</CODE> to CPU 1 on Socket 0 and use the predefined plot configuration <CODE>L3_BAND</CODE> but change the title for the plot to "My Title".
+<LI><CODE>likwid-perfscope -g L3 -C S0:1 -t 500ms ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 1 on Socket 0 and use the predefined plot configuration <CODE>L3</CODE>.
 </LI>
-<LI><CODE>likwid-perfscope -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,CPI=FIXC0/FIXC1:YTITLE="CPI" -C 0 --time 2s ./a.out</CODE><BR>
-Pin the executable <CODE>./a.out</CODE> to CPUs 0 and use the custom event set <CODE>INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1</CODE>. The last event set entry specifies custom plot options. The plotted formula is <CODE>FIXC0/FIXC1</CODE> and the plot title and legend entry is set to 'CPI'.
-</LI>
-<LI><CODE>likwid-perfscope -g L3_BAND,CPI=FIXC0/FIXC1:Y2TITLE="2-Cycles per Instruction" -C 0 --time 2s ./a.out</CODE><BR>
-Pin the executable <CODE>./a.out</CODE> to CPU 0 and use the predefined plot configuration  <CODE>L3_BAND</CODE> to measure every 2 seconds. Additionally, a formula <CODE>FIXC0/FIXC1</CODE> with the name <CODE>CPI</CODE> is given. The right y-axis is associated to the given function and labeled with <CODE>Cycles per Instruction</CODE>. The formula ID 2 is not needed in this case as the default behavior is to associate the right y-axis to the last formula given.
+<LI><CODE>likwid-perfscope -g ENERGY -C S0:1 -r 10 -t 500ms ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 1 on Socket 0 and use the predefined plot configuration <CODE>ENERGY</CODE>. Use a sliding window that contains 10 data points.
 </LI>
 </UL>
 
diff --git a/doc/applications/likwid-pin.md b/doc/applications/likwid-pin.md
index b8c8a1e..68592ca 100644
--- a/doc/applications/likwid-pin.md
+++ b/doc/applications/likwid-pin.md
@@ -74,7 +74,7 @@ For applications where first touch policy on NUMA systems cannot be employed <CO
   <TD>'arg' must be a bitmask in hex. Threads with the ID equal to a set bit in bitmask will be skipped during pinning<BR>Example: 0x1 = Thread 0 is skipped.</TD>
 </TR>
 <TR>
-  <TD>-d</TD>
+  <TD>-d <delim></TD>
   <TD>Set the delimiter for the output of -p. Default is ','</TD>
 </TR>
 </TABLE>
diff --git a/doc/archs/broadwell.md b/doc/archs/broadwell.md
index ff207af..9d1d029 100644
--- a/doc/archs/broadwell.md
+++ b/doc/archs/broadwell.md
@@ -1,17 +1,19 @@
 /*! \page broadwell Intel® Broadwell
 
-<P>This page is valid for Broadwell, Broadwell single socket server (Xeon D) and Broadwell EP/EN/EX. No Uncore support by now, no documentation is available for the Uncore counters of Broadwell</P>
+<P>This page is valid for Broadwell. The Broadwell microarchitecture supports the UBOX and the CBOX Uncore devices.</P>
 
 <H1>Available performance monitors for the Intel® Broadwell microarchitecture</H1>
 <UL>
-<LI>\ref BRD_FIXED "Fixed-purpose counters"</LI>
-<LI>\ref BRD_PMC "General-purpose counters"</LI>
-<LI>\ref BRD_THERMAL "Thermal counters"</LI>
-<LI>\ref BRD_POWER "Power measurement counters"</LI>
+<LI>\ref BDW_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref BDW_PMC "General-purpose counters"</LI>
+<LI>\ref BDW_THERMAL "Thermal counters"</LI>
+<LI>\ref BDW_POWER "Power measurement counters"</LI>
+<LI>\ref BDW_UBOX "Uncore global counters"</LI>
+<LI>\ref BDW_CBOX "Last level cache counters"</LI>
 </UL>
 
 <H1>Counters available for each hardware thread</H1>
-\anchor BRD_FIXED
+\anchor BDW_FIXED
 <H2>Fixed-purpose counters</H2>
 <P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
 <H3>Counter and events</H3>
@@ -55,7 +57,7 @@
 </TR>
 </TABLE>
 
-\anchor BRD_PMC
+\anchor BDW_PMC
 <H2>General-purpose counters</H2>
 <P>Commonly the Intel® Broadwell microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
 <H3>Counter and events</H3>
@@ -157,9 +159,9 @@
 </TABLE>
 <P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility for performance monitoring. Although we can program it from user-space, the results are always 0.</P>
 
-\anchor BRD_THERMAL
+\anchor BDW_THERMAL
 <H2>Thermal counter</H2>
-<P>The Intel® Haswell microarchitecture provides one register for the current core temperature.</P>
+<P>The Intel® Broadwell microarchitecture provides one register for the current core temperature.</P>
 <H3>Counter and events</H3>
 <TABLE>
 <TR>
@@ -173,7 +175,7 @@
 </TABLE>
 
 <H1>Counters available for one hardware thread per socket</H1>
-\anchor BRD_POWER
+\anchor BDW_POWER
 <H2>Power counter</H2>
 <P>The Intel® Broadwell microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
 <H3>Counter and events</H3>
@@ -200,4 +202,99 @@
 </TR>
 </TABLE>
 
+\anchor BDW_UBOX
+<H2>Uncore global counters</H2>
+<P>The Intel® Broadwell microarchitecture provides measurements for the global uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UNCORE_CLOCK</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor BDW_CBOX
+<H2>Last level cache counters</H2>
+<P>The Intel® Broadwell microarchitecture provides measurements for the last level cache segments.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-3>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-3>C1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
 */
diff --git a/doc/archs/broadwelld.md b/doc/archs/broadwelld.md
new file mode 100644
index 0000000..894042f
--- /dev/null
+++ b/doc/archs/broadwelld.md
@@ -0,0 +1,700 @@
+/*! \page broadwelld Intel® Broadwell D
+
+<P>This page is valid for Broadwell D.</P>
+
+<H1>Available performance monitors for the Intel® Broadwell D microarchitecture</H1>
+<UL>
+<LI>\ref BDW_DE_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref BDW_DE_PMC "General-purpose counters"</LI>
+<LI>\ref BDW_DE_THERMAL "Thermal counters"</LI>
+<LI>\ref BDW_DE_POWER "Power measurement counters"</LI>
+<LI>\ref BDW_DE_UBOX "Uncore global counters"</LI>
+<LI>\ref BDW_DE_CBOX "Last level cache counters"</LI>
+<LI>\ref BDW_DE_BBOX "Home Agent counters"</LI>
+<LI>\ref BDW_DE_WBOX "Power control unit counters"</LI>
+<LI>\ref BDW_DE_IBOX "Coherency for IIO traffic counters"</LI>
+<LI>\ref BDW_DE_MBOX "Integrated memory controller counters"</LI>
+<LI>\ref BDW_DE_PBOX "Ring-to-PCIe interface counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor BDW_DE_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor BDW_DE_PMC
+<H2>General-purpose counters</H2>
+<P>Commonly the Intel® Broadwell D microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>in_transaction</TD>
+  <TD>N</TD>
+  <TD>Set bit 32 in config register</TD>
+  <TD>Only available if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+<TR>
+  <TD>in_transaction_aborted</TD>
+  <TD>N</TD>
+  <TD>Set bit 33 in config register</TD>
+  <TD>Only counter PMC2 and only if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Broadwell D microarchitecture supports measuring offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Broadwell D microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Custom filtering can be applied with [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/BDW-DE">https://download.01.org/perfmon/BDW-DE</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/BDW-DE">https://download.01.org/perfmon/BDW-DE</A>.</TD>
+</TR>
+</TABLE>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility for performance monitoring. Although we can program it from user-space, the results are always 0.</P>
+
+\anchor BDW_DE_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Broadwell D microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor BDW_DE_POWER
+<H2>Power counter</H2>
+<P>The Intel® Broadwell D microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+\anchor BDW_DE_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® Broadwell D microarchitecture provides measurements of the management box in the Uncore. The description from Intel®:<BR>
+<I>The UBox serves as the system configuration controller for the Intel® Xeon® Processor D-1500 Product Family. In this capacity, the UBox acts as the central unit for a variety of functions:
+<UL>
+<LI>The master for reading and writing physically distributed registers across using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
+</I><BR>
+The Uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX Uncore monitoring where those functional units are called UBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UBOX_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for UBOX<0,1> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor BDW_DE_CBOX
+<H2>Last level cache counters</H2>
+<P>The Intel® Broadwell D microarchitecture provides measurements for the last level cache segments. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the
+LLC; generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I></P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in MSR_UNC_C<0-15>_PMON_BOX_FILTER register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>state</TD>
+  <TD>7 bit hex value</TD>
+  <TD>Set bits 17-23 in MSR_UNC_C<0-15>_PMON_BOX_FILTER register</TD>
+  <TD>M': 0x40, D: 0x20, F: 0x10, M: 0x08, E: 0x04, S: 0x02, I: 0x01</TD>
+</TR>
+<TR>
+  <TD>nid</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Set bits 0-15 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+  <TD>Note: Node 0 has value 0x0001</TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>9 bit hex value</TD>
+  <TD>Set bits 20-28 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+  <TD>A list of valid opcodes can be found in the <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-d-1500-uncore-performance-monitoring.html">Intel® Xeon D-1500 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>2 bit hex address</TD>
+  <TD>Set bits 30-31 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+  <TD>See the <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-d-1500-uncore-performance-monitoring.html">Intel® Xeon D-1500 Uncore Manual</A> for more information.</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>The Intel® Broadwell D microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
+
+\anchor BDW_DE_BBOX
+<H2>Home Agent counters</H2>
+<P>The Intel® Broadwell D microarchitecture provides measurements of the Home Agent (HA) in the Uncore. The description from Intel®:<BR>
+<I>Each HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the IMC (memory controller).
+</I><BR>
+The Home Agent performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the HA. For systems where each socket has 12 or more cores, there are both HAs available. The name BBOX originates from the Nehalem EX Uncore monitoring where this functional unit is called BBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>46 bit hex address</TD>
+  <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor BDW_DE_WBOX
+<H2>Power control unit counters</H2>
+<P>The Intel® Broadwell D microarchitecture provides measurements of the power control unit (PCU) in the Uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the Intel® Xeon® Processor D-1500 Product Family.<BR>
+The uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal microcontroller and coordinates the socket’s power states.
+</I><BR>
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX Uncore monitoring where those functional units are called WBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX0FIX</TD>
+  <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+  <TD>WBOX1FIX</TD>
+  <TD>CORES_IN_C6</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>occupancy</TD>
+  <TD>2 bit hex value</TD>
+  <TD>Set bits 14-15 in config register</TD>
+  <TD>Cores in C0: 0x1, in C3: 0x2, in C6: 0x3</TD>
+</TR>
+<TR>
+  <TD>occupancy_filter</TD>
+  <TD>32 bit hex value</TD>
+  <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+  <TD>Band0: bits 0-7, Band1: bits 8-15, Band2: bits 16-23, Band3: bits 24-31</TD>
+</TR>
+<TR>
+  <TD>occupancy_edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>occupancy_invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 30 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor BDW_DE_IBOX
+<H2>IRP box counters</H2>
+<P>The Intel® Broadwell D microarchitecture provides measurements of the IRP box in the Uncore. The description from Intel®:<BR>
+<I>IRP is responsible for maintaining coherency for IIO traffic that needs to be coherent (e.g. cross-socket P2P).
+</I>
+
+The IRP box counters are exposed to the operating system through the PCI interface. The IBOX was introduced with the Intel® IvyBridge EP/EN/EX microarchitecture.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>IBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>IBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor BDW_DE_MBOX
+<H2>Memory controller counters</H2>
+<P>The Intel® Broadwell D microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® Xeon® Processor D-1500 Product Family integrated Memory Controller provides the interface to DRAM and communicates to the rest of the Uncore through the Home Agent (i.e. the IMC does not connect to the Ring).
+
+<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features.
+
+</I><BR>
+The integrated Memory Controllers performance counters are exposed to the operating system through PCI interfaces. There may be two memory controllers in the system. There are 4 different PCI devices per memory controller, but only 2 channels. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The channels of the first memory controller are MBOX0-3, the four channels of the second memory controller (if available) are named MBOX4-7. The name MB [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>FIX</TD>
+  <TD>DRAM_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter MBOX<0-7>C<0-3>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor BDW_DE_PBOX
+<H2>Ring-to-PCIe counters</H2>
+<P>The Intel® Broadwell D microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the Uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.</I><BR>
+The Ring-to-PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface per CPU socket.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/haswellep.md b/doc/archs/broadwellep.md
similarity index 59%
copy from doc/archs/haswellep.md
copy to doc/archs/broadwellep.md
index 9368c54..8b2491e 100644
--- a/doc/archs/haswellep.md
+++ b/doc/archs/broadwellep.md
@@ -1,26 +1,27 @@
-/*! \page haswellep Intel® Haswell EP/EN/EX
+/*! \page broadwellep Intel® Broadwell EP/EN/EX
 
+<P>This page is valid for Broadwell EP/EN/EX.</P>
 
-<H1>Available performance monitors for the Intel® Haswell EP/EN/EX microarchitecture</H1>
+<H1>Available performance monitors for the Intel® Broadwell EP/EN/EX microarchitecture</H1>
 <UL>
-<LI>\ref HASEP_FIXED "Fixed-purpose counters"</LI>
-<LI>\ref HASEP_PMC "General-purpose counters"</LI>
-<LI>\ref HASEP_THERMAL "Thermal counters"</LI>
-<LI>\ref HASEP_POWER "Power measurement counters"</LI>
-<LI>\ref HASEP_BBOX "Home Agent counters"</LI>
-<LI>\ref HASEP_SBOX "Ring transfer counters"</LI>
-<LI>\ref HASEP_QBOX "Intel® QPI Link Layer counters"</LI>
-<LI>\ref HASEP_CBOX "Last Level cache counters"</LI>
-<LI>\ref HASEP_UBOX "Uncore management counters"</LI>
-<LI>\ref HASEP_WBOX "Power control unit counters"</LI>
-<LI>\ref HASEP_IBOX "Coherency for IIO traffic counters"</LI>
-<LI>\ref HASEP_MBOX "Integrated memory controller counters"</LI>
-<LI>\ref HASEP_RBOX "Ring-to-QPI interface counters"</LI>
-<LI>\ref HASEP_PBOX "Ring-to-PCIe interface counters"</LI>
+<LI>\ref BDX_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref BDX_PMC "General-purpose counters"</LI>
+<LI>\ref BDX_THERMAL "Thermal counters"</LI>
+<LI>\ref BDX_POWER "Power measurement counters"</LI>
+<LI>\ref BDX_UBOX "Uncore global counters"</LI>
+<LI>\ref BDX_CBOX "Last level cache counters"</LI>
+<LI>\ref BDX_BBOX "Home Agent counters"</LI>
+<LI>\ref BDX_WBOX "Power control unit counters"</LI>
+<LI>\ref BDX_IBOX "Coherency for IIO traffic counters"</LI>
+<LI>\ref BDX_SBOX "Ring transfer counters"</LI>
+<LI>\ref BDX_QBOX "Intel® QPI Link Layer counters"</LI>
+<LI>\ref BDX_MBOX "Integrated memory controller counters"</LI>
+<LI>\ref BDX_PBOX "Ring-to-PCIe interface counters"</LI>
+<LI>\ref BDX_RBOX "Ring-to-QPI interface counters"</LI>
 </UL>
 
 <H1>Counters available for each hardware thread</H1>
-\anchor HASEP_FIXED
+\anchor BDX_FIXED
 <H2>Fixed-purpose counters</H2>
 <P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
 <H3>Counter and events</H3>
@@ -64,9 +65,9 @@
 </TR>
 </TABLE>
 
-\anchor HASEP_PMC
+\anchor BDX_PMC
 <H2>General-purpose counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<P>Commonly the Intel® Broadwell EP/EN/EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
 <H3>Counter and events</H3>
 <TABLE>
 <TR>
@@ -143,7 +144,7 @@
 </TABLE>
 
 <H3>Special handling for events</H3>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measureing of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Haswell microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied wit [...]
+<P>The Intel® Broadwell EP/EN/EX microarchitecture supports measuring offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Broadwell EP/EN/EX microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Custom filtering can b [...]
 <TABLE>
 <TR>
   <TH>Option</TH>
@@ -155,20 +156,20 @@
   <TD>match0</TD>
   <TD>16 bit hex value</TD>
   <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
-  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SLM">https://download.01.org/perfmon/HSX</A>.</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/BDX">https://download.01.org/perfmon/BDX</A>.</TD>
 </TR>
 <TR>
   <TD>match1</TD>
   <TD>22 bit hex value</TD>
   <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
-  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SLM">https://download.01.org/perfmon/HSX</A>.</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/BDX">https://download.01.org/perfmon/BDX</A>.</TD>
 </TR>
 </TABLE>
-<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility. Although we can programm it from user-space, the results are always 0.</P>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility for performance monitoring. Although we can program it from user-space, the results are always 0.</P>
 
-\anchor HASEP_THERMAL
+\anchor BDX_THERMAL
 <H2>Thermal counter</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides one register for the current core temperature.</P>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides one register for the current core temperature.</P>
 <H3>Counter and events</H3>
 <TABLE>
 <TR>
@@ -182,9 +183,9 @@
 </TABLE>
 
 <H1>Counters available for one hardware thread per socket</H1>
-\anchor HASEP_POWER
+\anchor BDX_POWER
 <H2>Power counter</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
 <H3>Counter and events</H3>
 <TABLE>
 <TR>
@@ -209,13 +210,17 @@
 </TR>
 </TABLE>
 
-
-\anchor HASEP_BBOX
-<H2>Home Agent counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Home Agent (HA) in the Uncore. The description from Intel®:<BR>
-<I>Each HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the IMC (memory controller).
+\anchor BDX_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides measurements of the management box in the Uncore. The description from Intel®:<BR>
+<I>The UBox serves as the system configuration controller for the Intel® Xeon® Processor E5 and E7 v4 Product Families. In this capacity, the UBox acts as the central unit for a variety of functions:
+<UL>
+<LI>The master for reading and writing physically distributed registers across using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
 </I><BR>
-The Home Agent performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the HA. For systems where each socket has 12 or more cores, there are both HAs available. The name BBOX originates from the Nehalem EX Uncore monitoring where this functional unit is called BBOX.
+The Uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX Uncore monitoring where those functional units are called UBOX.
 </P>
 <H3>Counter and events</H3>
 <TABLE>
@@ -224,29 +229,25 @@ The Home Agent performance counters are exposed to the operating system through
   <TH>Event name</TH>
 </TR>
 <TR>
-  <TD>BBOX<0,1>C0</TD>
-  <TD>*</TD>
-</TR>
-<TR>
-  <TD>BBOX<0,1>C1</TD>
+  <TD>UBOX0</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>BBOX<0,1>C2</TD>
+  <TD>UBOX1</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>BBOX<0,1>C3</TD>
-  <TD>*</TD>
+  <TD>UBOXFIX</TD>
+  <TD>UBOX_CLOCKTICKS</TD>
 </TR>
 </TABLE>
 
-<H3>Available Options</H3>
+<H3>Available Options (Only for UBOX<0,1> counters)</H3>
 <TABLE>
 <TR>
   <TH>Option</TH>
   <TH>Argument</TH>
-  <TH>Description</TH>
+  <TH>Operation</TH>
   <TH>Comment</TH>
 </TR>
 <TR>
@@ -257,8 +258,8 @@ The Home Agent performance counters are exposed to the operating system through
 </TR>
 <TR>
   <TD>threshold</TD>
-  <TD>8 bit hex value</TD>
-  <TD>Set bits 24-31 in config register</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
   <TD></TD>
 </TR>
 <TR>
@@ -267,28 +268,14 @@ The Home Agent performance counters are exposed to the operating system through
   <TD>Set bit 23 in config register</TD>
   <TD></TD>
 </TR>
-<TR>
-  <TD>opcode</TD>
-  <TD>6 bit hex value</TD>
-  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
-  <TD></TD>
-</TR>
-<TR>
-  <TD>match0</TD>
-  <TD>46 bit hex address</TD>
-  <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
-  <TD></TD>
-</TR>
 </TABLE>
 
-\anchor HASEP_SBOX
-<H2>Ring-to-Ring interface counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture manages the socket internal traffic through ring-based networks. Depending on the system's configuration there are multiple rings in one socket. The SBOXes organizes the traffic between the rings. The description from Intel®:<BR>
-<I>The SBox manages the interface between the two Rings.<BR>
-The processor is composed of two independent rings connected via two sets of bi-directional buffered switches. Each set of bi-directional buffered switches is partitioned into two ingress/egress pairs. Further, each ingress/egress pair is associated with a ring stop on adjacent rings. This ring stop is termed an Sbo. The processor has up to 4 SBos depending on SKU. The Sbo can be simply thought of as a conduit for the ring, but must also help maintain ordering of traffic to ensure functi [...]
-</I><BR>
-The SBOX hardware performance counters are exposed to the operating system through the MSR interface. There are maximal four of those interfaces but not all must be present. The name SBOX originates from the Nehalem EX Uncore monitoring where the functional unit to the QPI network is called SBOX but it had a different duty..
-</P>
+\anchor BDX_CBOX
+<H2>Last level cache counters</H2>
+<P>The Intel® Broadwell microarchitecture provides measurements for the last level cache segments. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the
+LLC; generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I></P>
 <H3>Counter and events</H3>
 <TABLE>
 <TR>
@@ -296,19 +283,19 @@ The SBOX hardware performance counters are exposed to the operating system throu
   <TH>Event name</TH>
 </TR>
 <TR>
-  <TD>SBOX<0-3>C0</TD>
+  <TD>CBOX<0-23>C0</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>SBOX<0-3>C1</TD>
+  <TD>CBOX<0-23>C1</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>SBOX<0-3>C2</TD>
+  <TD>CBOX<0-23>C2</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>SBOX<0-3>C3</TD>
+  <TD>CBOX<0-23>C3</TD>
   <TD>*</TD>
 </TR>
 </TABLE>
@@ -329,7 +316,7 @@ The SBOX hardware performance counters are exposed to the operating system throu
 <TR>
   <TD>threshold</TD>
   <TD>8 bit hex value</TD>
-  <TD>Set bits 24-31 in config register</TD>
+  <TD>Set bits 24-28 in config register</TD>
   <TD></TD>
 </TR>
 <TR>
@@ -340,18 +327,44 @@ The SBOX hardware performance counters are exposed to the operating system throu
 </TR>
 <TR>
   <TD>tid</TD>
-  <TD>N</TD>
-  <TD>Set bit 19 in config register</TD>
-  <TD>This option has no real effect because TID filtering can be activated but there is no possibility to specify the TID somewhere.</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in MSR_UNC_C<0-23>_PMON_BOX_FILTER register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>state</TD>
+  <TD>7 bit hex value</TD>
+  <TD>Set bits 17-23 in MSR_UNC_C<0-23>_PMON_BOX_FILTER register</TD>
+  <TD>M': 0x40, D: 0x20, F: 0x10, M: 0x08, E: 0x04, S: 0x02, I: 0x01</TD>
+</TR>
+<TR>
+  <TD>nid</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Set bits 0-15 in MSR_UNC_C<0-23>_PMON_BOX_FILTER1 register</TD>
+  <TD>Note: Node 0 has value 0x0001</TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>9 bit hex value</TD>
+  <TD>Set bits 20-28 in MSR_UNC_C<0-23>_PMON_BOX_FILTER1 register</TD>
+  <TD>A list of valid opcodes can be found in the <A HREF="http://www.intel.ie/content/www/ie/en/processors/xeon/xeon-e5-v4-uncore-performance-monitoring.html">Intel® Xeon E5 v4 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>2 bit hex address</TD>
+  <TD>Set bits 30-31 in MSR_UNC_C<0-23>_PMON_BOX_FILTER1 register</TD>
+  <TD>See the <A HREF="http://www.intel.ie/content/www/ie/en/processors/xeon/xeon-e5-v4-uncore-performance-monitoring.html">Intel® Xeon E5 v4 Uncore Manual</A> for more information.</TD>
 </TR>
 </TABLE>
+<H3>Special handling for events</H3>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
 
-\anchor HASEP_QBOX
-<H2>QPI interface counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the QPI Link layer (QPI) in the Uncore. The description from Intel®:<BR>
-<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface. As such, it shares responsibility with the CBo(s) as the Intel® QPI caching agent(s). It is responsible for converting CBo requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa. On Intel® Xeon processor  [...]
+\anchor BDX_BBOX
+<H2>Home Agent counters</H2>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides measurements of the Home Agent (HA) in the Uncore. The description from Intel®:<BR>
+<I>Each HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the IMC (memory controller).
 </I><BR>
-The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the QPI. The actual amount of QBOX counters depend on the CPU core count of one socket. If your system has not all interfaces but interface 0 does not work, try the other ones. The QBOX was introduced for the Haswell EP microarchitecture, for older Uncore-aware architectures the QBOX and the SBOX are the same.
+The Home Agent performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the HA. The name BBOX originates from the Nehalem EX Uncore monitoring where this functional unit is called BBOX.
 </P>
 <H3>Counter and events</H3>
 <TABLE>
@@ -360,36 +373,24 @@ The QPI hardware performance counters are exposed to the operating system throug
   <TH>Event name</TH>
 </TR>
 <TR>
-  <TD>QBOX<0,1>C0</TD>
+  <TD>BBOX<0,1>C0</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>QBOX<0,1>C1</TD>
+  <TD>BBOX<0,1>C1</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>QBOX<0,1>C2</TD>
+  <TD>BBOX<0,1>C2</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>QBOX<0,1>C3</TD>
+  <TD>BBOX<0,1>C3</TD>
   <TD>*</TD>
 </TR>
-<TR>
-  <TD>QBOX<0,1>FIX0</TD>
-  <TD>QPI_RATE</TD>
-</TR>
-<TR>
-  <TD>QBOX<0,1>FIX1</TD>
-  <TD>QPI_RX_IDLE</TD>
-</TR>
-<TR>
-  <TD>QBOX<0,1>FIX2</TD>
-  <TD>QPI_RX_LLR</TD>
-</TR>
 </TABLE>
 
-<H3>Available Options (Only for QBOX<0,1>C<0,1,2,3> counters)</H3>
+<H3>Available Options</H3>
 <TABLE>
 <TR>
   <TH>Option</TH>
@@ -410,62 +411,32 @@ The QPI hardware performance counters are exposed to the operating system throug
   <TD></TD>
 </TR>
 <TR>
-  <TD>match0</TD>
-  <TD>32 bit hex address</TD>
-  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_RX_MATCH_0 register of PCI device</TD>
-  <TD>This option matches the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
-</TR>
-<TR>
-  <TD>match1</TD>
-  <TD>20 bit hex address</TD>
-  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_RX_MATCH_1 register of PCI device</TD>
-  <TD>This option matches the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
-</TR>
-<TR>
-  <TD>match2</TD>
-  <TD>32 bit hex address</TD>
-  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_TX_MATCH_0 register of PCI device</TD>
-  <TD>This option matches the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
-</TR>
-<TR>
-  <TD>match3</TD>
-  <TD>20 bit hex address</TD>
-  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_TX_MATCH_1 register of PCI device</TD>
-  <TD>This option matches the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
-</TR>
-<TR>
-  <TD>mask0</TD>
-  <TD>32 bit hex address</TD>
-  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_RX_MASK_0 register of PCI device</TD>
-  <TD>This option masks the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
-</TR>
-<TR>
-  <TD>mask1</TD>
-  <TD>20 bit hex address</TD>
-  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_RX_MASK_1 register of PCI device</TD>
-  <TD>This option masks the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
 </TR>
 <TR>
-  <TD>mask2</TD>
-  <TD>32 bit hex address</TD>
-  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_TX_MASK_0 register of PCI device</TD>
-  <TD>This option masks the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>opcode</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+  <TD></TD>
 </TR>
 <TR>
-  <TD>mask3</TD>
-  <TD>20 bit hex address</TD>
-  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_TX_MASK_1 register of PCI device</TD>
-  <TD>This option masks the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>match0</TD>
+  <TD>46 bit hex address</TD>
+  <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+  <TD></TD>
 </TR>
 </TABLE>
 
-\anchor HASEP_CBOX
-<H2>Last Level cache counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the LLC coherency engine in the Uncore. The description from Intel®:<BR>
-<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery
-from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC; generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+\anchor BDX_WBOX
+<H2>Power control unit counters</H2>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides measurements of the power control unit (PCU) in the Uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the Intel® Xeon® Processor E5 and E7 v4 Product Families.<BR>
+The uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal microcontroller and coordinates the socket’s power states.
 </I><BR>
-The LLC hardware performance counters are exposed to the operating system through the MSR interface. The maximal amount of supported coherency engines for the Intel® Haswell EP/EN/EX microarchitecture is 17. E7-8800 v2 systems have all 17 engines, the E5-2600 v2 only 10 of them and the E5-1600 v2 only 6. It may be possible that your systems does not have all CBOXes, LIKWID will skip the unavailable ones in the setup phase. The name CBOX originates from the Nehalem EX Uncore monitorin [...]
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX Uncore monitoring where those functional units are called WBOX.
 </P>
 <H3>Counter and events</H3>
 <TABLE>
@@ -474,24 +445,32 @@ The LLC hardware performance counters are exposed to the operating system throug
   <TH>Event name</TH>
 </TR>
 <TR>
-  <TD>CBOX<0-17>C0</TD>
+  <TD>WBOX0</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>CBOX<0-17>C1</TD>
+  <TD>WBOX1</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>CBOX<0-17>C2</TD>
+  <TD>WBOX2</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>CBOX<0-17>C3</TD>
+  <TD>WBOX3</TD>
   <TD>*</TD>
 </TR>
+<TR>
+  <TD>WBOX0FIX</TD>
+  <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+  <TD>WBOX1FIX</TD>
+  <TD>CORES_IN_C6</TD>
+</TR>
 </TABLE>
 
-<H3>Available Options</H3>
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
 <TABLE>
 <TR>
   <TH>Option</TH>
@@ -506,57 +485,49 @@ The LLC hardware performance counters are exposed to the operating system throug
   <TD></TD>
 </TR>
 <TR>
-  <TD>threshold</TD>
-  <TD>5 bit hex value</TD>
-  <TD>Set bits 24-28 in config register</TD>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
   <TD></TD>
 </TR>
 <TR>
-  <TD>tid</TD>
+  <TD>threshold</TD>
   <TD>5 bit hex value</TD>
-  <TD>Set bits 0-4 in MSR_UNC_C<0-17>_PMON_BOX_FILTER register</TD>
+  <TD>Set bits 24-28 in config register</TD>
   <TD></TD>
 </TR>
 <TR>
-  <TD>state</TD>
-  <TD>6 bit hex value</TD>
-  <TD>Set bits 17-22 in MSR_UNC_C<0-17>_PMON_BOX_FILTER register</TD>
-  <TD>M: 0x28, F: 0x10, M: 0x08, E: 0x04, S: 0x02, I: 0x01</TD>
+  <TD>occupancy</TD>
+  <TD>2 bit hex value</TD>
+  <TD>Set bits 14-15 in config register</TD>
+  <TD>Cores in C0: 0x1, in C3: 0x2, in C6: 0x3</TD>
 </TR>
 <TR>
-  <TD>nid</TD>
-  <TD>16 bit hex value</TD>
-  <TD>Set bits 0-15 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
-  <TD>Note: Node 0 has value 0x0001</TD>
+  <TD>occupancy_filter</TD>
+  <TD>32 bit hex value</TD>
+  <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+  <TD>Band0: bits 0-7, Band1: bits 8-15, Band2: bits 16-23, Band3: bits 24-31</TD>
 </TR>
 <TR>
-  <TD>opcode</TD>
-  <TD>9 bit hex value</TD>
-  <TD>Set bits 20-28 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
-  <TD>A list of valid opcodes can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A>.</TD>
+  <TD>occupancy_edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 31 in config register</TD>
+  <TD></TD>
 </TR>
 <TR>
-  <TD>match0</TD>
-  <TD>2 bit hex address</TD>
-  <TD>Set bits 30-31 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
-  <TD>See the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for more information.</TD>
+  <TD>occupancy_invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 30 in config register</TD>
+  <TD></TD>
 </TR>
 </TABLE>
 
-<H3>Special handling for events</H3>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
-
-\anchor HASEP_UBOX
-<H2>Uncore management counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the management box in the Uncore. The description from Intel®:<BR>
-<I>The UBox serves as the system configuration controller within the physical processor. In this capacity, the UBox acts as the central unit for a variety of functions:
-<UL>
-<LI>The master for reading and writing physically distributed registers across Intel® Xeon processor E5 v3 family using the Message Channel.</LI>
-<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
-<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
-</UL>
-</I><BR>
-The Uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX Uncore monitoring where those functional units are called UBOX.
+\anchor BDX_IBOX
+<H2>IRP box counters</H2>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides measurements of the IRP box in the Uncore. The description from Intel®:<BR>
+<I>IRP is responsible for maintaining coherency for IIO traffic that needs to be coherent (e.g. cross-socket P2P).
+</I>
+The IRP box counters are exposed to the operating system through the PCI interface. The IBOX was introduced with the Intel® IvyBridge EP/EN/EX microarchitecture.
 </P>
 <H3>Counter and events</H3>
 <TABLE>
@@ -565,20 +536,16 @@ The Uncore management performance counters are exposed to the operating system t
   <TH>Event name</TH>
 </TR>
 <TR>
-  <TD>UBOX0</TD>
+  <TD>IBOX<0,1>C0</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>UBOX1</TD>
+  <TD>IBOX<0,1>C1</TD>
   <TD>*</TD>
 </TR>
-<TR>
-  <TD>UBOXFIX</TD>
-  <TD>UBOX_CLOCKTICKS</TD>
-</TR>
 </TABLE>
 
-<H3>Available Options (Only for UBOX<0,1> counters)</H3>
+<H3>Available Options</H3>
 <TABLE>
 <TR>
   <TH>Option</TH>
@@ -593,19 +560,27 @@ The Uncore management performance counters are exposed to the operating system t
   <TD></TD>
 </TR>
 <TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
   <TD>threshold</TD>
-  <TD>5 bit hex value</TD>
-  <TD>Set bits 24-28 in config register</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
   <TD></TD>
 </TR>
 </TABLE>
 
-\anchor HASEP_WBOX
-<H2>Power control unit counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the power control unit (PCU) in the Uncore. The description from Intel®:<BR>
-<I>The PCU is the primary Power Controller for the Intel® Xeon processor E5 v3 family. Intel® Xeon processor E5 v3 family uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal micro-controller and coordinates the socket’s power states.
+\anchor BDX_MBOX
+<H2>Memory controller counters</H2>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® Xeon® Processor E5 and E7 v4 Product Families integrated Memory Controller provides the interface to DRAM and communicates to the rest of the Uncore through the Home Agent (i.e. the IMC does not connect to the Ring).
+<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
 </I><BR>
-The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX Uncore monitoring where those functional units are called WBOX.
+The integrated Memory Controllers performance counters are exposed to the operating system through PCI interfaces. There may be two memory controllers in the system. There are 4 different PCI devices per memory controller, each handling 4 memory channels. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The channels of the first memory controller are MBOX0-3, the four channels of the second memory controller (if available) are named MBOX4-7. [...]
 </P>
 <H3>Counter and events</H3>
 <TABLE>
@@ -614,32 +589,28 @@ The PCU performance counters are exposed to the operating system through the MSR
   <TH>Event name</TH>
 </TR>
 <TR>
-  <TD>WBOX0</TD>
+  <TD>MBOX<0-7>C0</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>WBOX1</TD>
+  <TD>MBOX<0-7>C1</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>WBOX2</TD>
+  <TD>MBOX<0-7>C2</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>WBOX3</TD>
+  <TD>MBOX<0-7>C3</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>WBOX0FIX</TD>
-  <TD>CORES_IN_C3</TD>
-</TR>
-<TR>
-  <TD>WBOX1FIX</TD>
-  <TD>CORES_IN_C6</TD>
+  <TD>MBOX<0-7>FIX</TD>
+  <TD>DRAM_CLOCKTICKS</TD>
 </TR>
 </TABLE>
 
-<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<H3>Available Options (Only for counter MBOX<0-7>C<0-3>)</H3>
 <TABLE>
 <TR>
   <TH>Option</TH>
@@ -654,44 +625,25 @@ The PCU performance counters are exposed to the operating system through the MSR
   <TD></TD>
 </TR>
 <TR>
-  <TD>threshold</TD>
-  <TD>5 bit hex value</TD>
-  <TD>Set bits 24-28 in config register</TD>
-  <TD></TD>
-</TR>
-<TR>
-  <TD>match0</TD>
-  <TD>32 bit hex value</TD>
-  <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
-  <TD>Band0: bits 0-7, Band1: bits 8-15, Band2: bits 16-23, Band3: bits 24-31</TD>
-</TR>
-<TR>
-  <TD>occupancy</TD>
-  <TD>2 bit hex value</TD>
-  <TD>Set bit 14-15 in config register</TD>
-  <TD>Cores in C0: 0x1, in C3: 0x2, in C6: 0x3</TD>
-</TR>
-<TR>
-  <TD>occupancy_edgedetect</TD>
+  <TD>invert</TD>
   <TD>N</TD>
-  <TD>Set bit 31 in config register</TD>
+  <TD>Set bit 23 in config register</TD>
   <TD></TD>
 </TR>
 <TR>
-  <TD>occupancy_invert</TD>
-  <TD>N</TD>
-  <TD>Set bit 30 in config register</TD>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
   <TD></TD>
 </TR>
 </TABLE>
+Although the fixed-purpose registers (MBOX<0-7>FIX) have a bit to invert the counting strategy, it is not supported by LIKWID because the corresponding threshold bit range is missing. The documentation lists the invert bit but no threshold bit range.
 
-\anchor HASEP_IBOX
-<H2>IRP box counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the IRP box in the Uncore. The description from Intel®:<BR>
-<I>IRP is responsible for maintaining coherency for IIO traffic that needs to be coherent (e.g. cross-socket P2P).
-</I>
-
-The IRP box counters are exposed to the operating system through the PCI interface. The IBOX was introduced with the Intel® IvyBridge EP/EN/EX microarchitecture.
+\anchor BDX_PBOX
+<H2>Ring-to-PCIe counters</H2>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the Uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.</I><BR>
+The Ring-to-PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface per CPU socket.
 </P>
 <H3>Counter and events</H3>
 <TABLE>
@@ -700,11 +652,19 @@ The IRP box counters are exposed to the operating system through the PCI interfa
   <TH>Event name</TH>
 </TR>
 <TR>
-  <TD>IBOX<0,1>C0</TD>
+  <TD>PBOX0</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>IBOX<0,1>C1</TD>
+  <TD>PBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX3</TD>
   <TD>*</TD>
 </TR>
 </TABLE>
@@ -724,6 +684,12 @@ The IRP box counters are exposed to the operating system through the PCI interfa
   <TD></TD>
 </TR>
 <TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
   <TD>threshold</TD>
   <TD>8 bit hex value</TD>
   <TD>Set bits 24-31 in config register</TD>
@@ -731,13 +697,13 @@ The IRP box counters are exposed to the operating system through the PCI interfa
 </TR>
 </TABLE>
 
-\anchor HASEP_MBOX
-<H2>Memory controller counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the Uncore. The description from Intel®:<BR>
-<I>The Intel® Xeon processor E5 v3 family integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent (i.e. the IMC does not connect to the Ring).<BR>
-In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
+\anchor BDX_SBOX
+<H2>Ring-to-Ring interface counters</H2>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture manages the socket internal traffic through ring-based networks. Depending on the system's configuration there are multiple rings in one socket. The SBOXes organize the traffic between the rings. The description from Intel®:<BR>
+<I>The SBox manages the interface between the two Rings.<BR>
+The processor is composed of two independent rings connected via two sets of bidirectional buffered switches. Each set of bidirectional buffered switches is partitioned into two ingress/egress pairs. Further, each ingress/egress pair is associated with a ring stop on adjacent rings. This ring stop is termed an Sbo. The processor has up to 4 SBos depending on SKU. The Sbo can be simply thought of as a conduit for the ring, but must also help maintain ordering of traffic to ensure function [...]
 </I><BR>
-The integrated Memory Controllers performance counters are exposed to the operating system through PCI interfaces. There may be two memory controllers in the system (E7-8800 v2). There are 4 different PCI devices per memory controller, each covering one memory channel. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The four channels of the first memory controller are MBOX0-3, the four channels of the second memory controller (if available) [...]
+The SBOX hardware performance counters are exposed to the operating system through the MSR interface. There are at most four of those interfaces, but not all of them must be present. The name SBOX originates from the Nehalem EX Uncore monitoring where the functional unit to the QPI network is called SBOX but it had a different duty.
 </P>
 <H3>Counter and events</H3>
 <TABLE>
@@ -746,33 +712,28 @@ The integrated Memory Controllers performance counters are exposed to the operat
   <TH>Event name</TH>
 </TR>
 <TR>
-  <TD>MBOX<0-7>C0</TD>
+  <TD>SBOX<0-3>C0</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>MBOX<0-7>C1</TD>
+  <TD>SBOX<0-3>C1</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>MBOX<0-7>C2</TD>
+  <TD>SBOX<0-3>C2</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>MBOX<0-7>C3</TD>
+  <TD>SBOX<0-3>C3</TD>
   <TD>*</TD>
 </TR>
-<TR>
-  <TD>MBOX<0-7>FIX</TD>
-  <TD>DRAM_CLOCKTICKS</TD>
-</TR>
 </TABLE>
-
-<H3>Available Options (Only for counter MBOX<0-7>C<0-3>)</H3>
+<H3>Available Options</H3>
 <TABLE>
 <TR>
   <TH>Option</TH>
   <TH>Argument</TH>
-  <TH>Operation</TH>
+  <TH>Description</TH>
   <TH>Comment</TH>
 </TR>
 <TR>
@@ -787,15 +748,35 @@ The integrated Memory Controllers performance counters are exposed to the operat
   <TD>Set bits 24-31 in config register</TD>
   <TD></TD>
 </TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>N</TD>
+  <TD>Set bit 19 in config register</TD>
+  <TD>This option has no real effect because TID filtering can be activated but there is no possibility to specify the TID somewhere.</TD>
+</TR>
 </TABLE>
 
-\anchor HASEP_RBOX
-<H2>Ring-to-QPI counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the Uncore. The description from Intel®:<BR>
-<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests, and the Ring.<BR>
-R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible for translating between ring protocol packets and flits that are used for transmitting data across the Intel® QPI interface. It performs credit checking between the local Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+\anchor BDX_QBOX
+<H2>QPI interface counters</H2>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides measurements of the QPI Link layer (QPI) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent
+on the way out to the system interface. As such, it shares responsibility with the CBo(s)
+as the Intel QPI caching agent(s). It is responsible for converting CBo requests to Intel
+QPI messages (i.e. snoop generation and data response messages from the snoop
+response) as well as converting/forwarding ring messages to Intel QPI packets and vice
+versa.<BR>
+On Intel® Xeon® Processor E5 and E7 v4 Product Families, Intel® QPI is split into two
+separate layers. The Intel® QPI LL (link layer) is responsible for generating,
+transmitting, and receiving packets with the Intel® QPI link.<BR>
+R3QPI (\ref BDX_RBOX) provides the interface to the Ring for the Link Layer. It is also the point where VNA/VN0 link credits are acquired.
 </I><BR>
-The Ring-to-QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic from the LLC-connecting ring interface on the socket with the QPI interfaces (SBOXes), the amount is similar to the amount of SBOXes. See at SBOXes how many are available for which system configuration. The name RBOX originates from the Nehalem EX Uncore monitoring where those functional units are called RBOX.
+The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the QPI. The actual number of QBOX counters depends on the CPU core count of one socket. If your system does not have all interfaces and interface 0 does not work, try the other ones. The QBOX was introduced for the Haswell EP microarchitecture; for older Uncore-aware architectures the QBOX and the SBOX are the same.
 </P>
 <H3>Counter and events</H3>
 <TABLE>
@@ -804,25 +785,41 @@ The Ring-to-QPI performance counters are exposed to the operating system through
   <TH>Event name</TH>
 </TR>
 <TR>
-  <TD>RBOX<0,1,2>C0</TD>
+  <TD>QBOX<0-2>C0</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>RBOX<0,1,2>C1</TD>
+  <TD>QBOX<0-2>C1</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>RBOX<0,1,2>C2</TD>
+  <TD>QBOX<0-2>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>QBOX<0-2>C3</TD>
   <TD>*</TD>
 </TR>
+<TR>
+  <TD>QBOX<0-2>FIX0</TD>
+  <TD>QPI_RATE</TD>
+</TR>
+<TR>
+  <TD>QBOX<0-2>FIX1</TD>
+  <TD>QPI_RX_IDLE</TD>
+</TR>
+<TR>
+  <TD>QBOX<0-2>FIX2</TD>
+  <TD>QPI_RX_LLR</TD>
+</TR>
 </TABLE>
 
-<H3>Available Options</H3>
+<H3>Available Options (Only for QBOX<0-2>C<0,1,2,3> counters)</H3>
 <TABLE>
 <TR>
   <TH>Option</TH>
   <TH>Argument</TH>
-  <TH>Operation</TH>
+  <TH>Description</TH>
   <TH>Comment</TH>
 </TR>
 <TR>
@@ -832,19 +829,79 @@ The Ring-to-QPI performance counters are exposed to the operating system through
   <TD></TD>
 </TR>
 <TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
   <TD>threshold</TD>
   <TD>8 bit hex value</TD>
   <TD>Set bits 24-31 in config register</TD>
   <TD></TD>
 </TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the Q_Py_PCI_PMON_RX_PKT_MATCH0 register of PCI device</TD>
+  <TD>This option matches the receive side. Check <A HREF="http://www.intel.ie/content/www/ie/en/processors/xeon/xeon-e5-v4-uncore-performance-monitoring.html">Intel® Xeon E5 v4 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the Q_Py_PCI_PMON_RX_PKT_MATCH1 register of PCI device</TD>
+  <TD>This option matches the receive side. Check <A HREF="http://www.intel.ie/content/www/ie/en/processors/xeon/xeon-e5-v4-uncore-performance-monitoring.html">Intel® Xeon E5 v4 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>match2</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the Q_Py_PCI_PMON_TX_PKT_MATCH0 register of PCI device</TD>
+  <TD>This option matches the transmit side. Check <A HREF="http://www.intel.ie/content/www/ie/en/processors/xeon/xeon-e5-v4-uncore-performance-monitoring.html">Intel® Xeon E5 v4 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>match3</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the Q_Py_PCI_PMON_TX_PKT_MATCH1 register of PCI device</TD>
+  <TD>This option matches the transmit side. Check <A HREF="http://www.intel.ie/content/www/ie/en/processors/xeon/xeon-e5-v4-uncore-performance-monitoring.html">Intel® Xeon E5 v4 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the Q_Py_PCI_PMON_RX_PKT_MASK0 register of PCI device</TD>
+  <TD>This option masks the receive side. Check <A HREF="http://www.intel.ie/content/www/ie/en/processors/xeon/xeon-e5-v4-uncore-performance-monitoring.html">Intel® Xeon E5 v4 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the Q_Py_PCI_PMON_RX_PKT_MASK1 register of PCI device</TD>
+  <TD>This option masks the receive side. Check <A HREF="http://www.intel.ie/content/www/ie/en/processors/xeon/xeon-e5-v4-uncore-performance-monitoring.html">Intel® Xeon E5 v4 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask2</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the Q_Py_PCI_PMON_TX_PKT_MASK0 register of PCI device</TD>
+  <TD>This option masks the transmit side. Check <A HREF="http://www.intel.ie/content/www/ie/en/processors/xeon/xeon-e5-v4-uncore-performance-monitoring.html">Intel® Xeon E5 v4 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask3</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the Q_Py_PCI_PMON_TX_PKT_MASK1 register of PCI device</TD>
+  <TD>This option masks the transmit side. Check <A HREF="http://www.intel.ie/content/www/ie/en/processors/xeon/xeon-e5-v4-uncore-performance-monitoring.html">Intel® Xeon E5 v4 Uncore Manual</A> for bit fields.</TD>
+</TR>
 </TABLE>
 
-\anchor HASEP_PBOX
-<H2>Ring-to-PCIe counters</H2>
-<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the Uncore. The description from Intel®:<BR>
-<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.
+\anchor BDX_RBOX
+<H2>Ring-to-QPI counters</H2>
+<P>The Intel® Broadwell EP/EN/EX microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the Uncore. The description from Intel®:<BR>
+<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests,
+and the Ring.<BR>
+R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible
+for translating between ring protocol packets and flits that are used for transmitting
+data across the Intel® QPI interface. It performs credit checking between the local
+Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+
 </I><BR>
-The Ring-to-PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface per CPU socket.
+The Ring-to-QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic from the LLC-connecting ring interface on the socket with the QPI interfaces (SBOXes), their number is similar to the number of SBOXes. See the SBOX section for how many are available for which system configuration. The name RBOX originates from the Nehalem EX Uncore monitoring where those functional units are called RBOX.
 </P>
 <H3>Counter and events</H3>
 <TABLE>
@@ -853,19 +910,15 @@ The Ring-to-PCIe performance counters are exposed to the operating system throug
   <TH>Event name</TH>
 </TR>
 <TR>
-  <TD>PBOX0</TD>
-  <TD>*</TD>
-</TR>
-<TR>
-  <TD>PBOX1</TD>
+  <TD>RBOX<0,1,2>C0</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>PBOX2</TD>
+  <TD>RBOX<0,1,2>C1</TD>
   <TD>*</TD>
 </TR>
 <TR>
-  <TD>PBOX3</TD>
+  <TD>RBOX<0,1,2>C2</TD>
   <TD>*</TD>
 </TR>
 </TABLE>
@@ -885,6 +938,12 @@ The Ring-to-PCIe performance counters are exposed to the operating system throug
   <TD></TD>
 </TR>
 <TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
   <TD>threshold</TD>
   <TD>8 bit hex value</TD>
   <TD>Set bits 24-31 in config register</TD>
@@ -892,5 +951,4 @@ The Ring-to-PCIe performance counters are exposed to the operating system throug
 </TR>
 </TABLE>
 
-
 */
diff --git a/doc/archs/haswell.md b/doc/archs/haswell.md
index 65836bd..c49591c 100644
--- a/doc/archs/haswell.md
+++ b/doc/archs/haswell.md
@@ -6,6 +6,8 @@
 <LI>\ref HAS_PMC "General-purpose counters"</LI>
 <LI>\ref HAS_THERMAL "Thermal counters"</LI>
 <LI>\ref HAS_POWER "Power measurement counters"</LI>
+<LI>\ref HAS_UBOX "Uncore global counters"</LI>
+<LI>\ref HAS_CBOX "Last level cache counters"</LI>
 </UL>
 
 <H1>Counters available for each hardware thread</H1>
@@ -198,6 +200,102 @@
 </TR>
 </TABLE>
 
+\anchor HAS_UBOX
+<H2>Uncore global counters</H2>
+<P>The Intel® Haswell microarchitecture provides measurements for the global uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UNCORE_CLOCK</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HAS_CBOX
+<H2>Last level cache counters</H2>
+<P>The Intel® Haswell microarchitecture provides measurements for the last level cache segments.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-3>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-3>C1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
 */
 
 
diff --git a/doc/archs/haswellep.md b/doc/archs/haswellep.md
index 9368c54..44cc32a 100644
--- a/doc/archs/haswellep.md
+++ b/doc/archs/haswellep.md
@@ -413,49 +413,49 @@ The QPI hardware performance counters are exposed to the operating system throug
   <TD>match0</TD>
   <TD>32 bit hex address</TD>
   <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_RX_MATCH_0 register of PCI device</TD>
-  <TD>This option matches the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>This option matches the receive side. Check <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-uncore-performance-monitoring.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
 </TR>
 <TR>
   <TD>match1</TD>
   <TD>20 bit hex address</TD>
   <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_RX_MATCH_1 register of PCI device</TD>
-  <TD>This option matches the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>This option matches the receive side. Check <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-uncore-performance-monitoring.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
 </TR>
 <TR>
   <TD>match2</TD>
   <TD>32 bit hex address</TD>
   <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_TX_MATCH_0 register of PCI device</TD>
-  <TD>This option matches the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>This option matches the transmit side. Check <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-uncore-performance-monitoring.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
 </TR>
 <TR>
   <TD>match3</TD>
   <TD>20 bit hex address</TD>
   <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_TX_MATCH_1 register of PCI device</TD>
-  <TD>This option matches the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>This option matches the transmit side. Check <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-uncore-performance-monitoring.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
 </TR>
 <TR>
   <TD>mask0</TD>
   <TD>32 bit hex address</TD>
   <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_RX_MASK_0 register of PCI device</TD>
-  <TD>This option masks the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>This option masks the receive side. Check <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-uncore-performance-monitoring.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
 </TR>
 <TR>
   <TD>mask1</TD>
   <TD>20 bit hex address</TD>
   <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_RX_MASK_1 register of PCI device</TD>
-  <TD>This option masks the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>This option masks the receive side. Check <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-uncore-performance-monitoring.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
 </TR>
 <TR>
   <TD>mask2</TD>
   <TD>32 bit hex address</TD>
   <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_TX_MASK_0 register of PCI device</TD>
-  <TD>This option masks the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>This option masks the transmit side. Check <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-uncore-performance-monitoring.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
 </TR>
 <TR>
   <TD>mask3</TD>
   <TD>20 bit hex address</TD>
   <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_TX_MASK_1 register of PCI device</TD>
-  <TD>This option masks the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+  <TD>This option masks the transmit side. Check <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-uncore-performance-monitoring.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
 </TR>
 </TABLE>
 
diff --git a/doc/archs/ivybridge.md b/doc/archs/ivybridge.md
index 3008475..615166c 100644
--- a/doc/archs/ivybridge.md
+++ b/doc/archs/ivybridge.md
@@ -6,6 +6,8 @@
 <LI>\ref IVB_PMC "General-purpose counters"</LI>
 <LI>\ref IVB_THERMAL "Thermal counters"</LI>
 <LI>\ref IVB_POWER "Power measurement counters"</LI>
+<LI>\ref IVB_UBOX "Uncore global counters"</LI>
+<LI>\ref IVB_CBOX "Last level cache counters"</LI>
 </UL>
 
 <H1>Counters available for each hardware thread</H1>
@@ -185,6 +187,103 @@
 </TR>
 </TABLE>
 <P>*) The PWR2 counter is often not implemented by Intel® IvyBridge systems</P>
+
+\anchor IVB_UBOX
+<H2>Uncore global counters</H2>
+<P>The Intel® IvyBridge microarchitecture provides measurements for the global uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UNCORE_CLOCK</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVB_CBOX
+<H2>Last level cache counters</H2>
+<P>The Intel® IvyBridge microarchitecture provides measurements for the last level cache segments.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-3>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-3>C1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
 */
 
 
diff --git a/doc/archs/sandybridge.md b/doc/archs/sandybridge.md
index 385a724..55e941f 100644
--- a/doc/archs/sandybridge.md
+++ b/doc/archs/sandybridge.md
@@ -1,11 +1,13 @@
 /*! \page sandybridge Intel® SandyBridge
 
-<H1>Available performance monitors for the Intel® IvyBridge microarchitecture</H1>
+<H1>Available performance monitors for the Intel® SandyBridge microarchitecture</H1>
 <UL>
 <LI>\ref SNB_FIXED "Fixed-purpose counters"</LI>
 <LI>\ref SNB_PMC "General-purpose counters"</LI>
 <LI>\ref SNB_THERMAL "Thermal counters"</LI>
 <LI>\ref SNB_POWER "Power measurement counters"</LI>
+<LI>\ref SNB_UBOX "Uncore global counters"</LI>
+<LI>\ref SNB_CBOX "Last level cache counters"</LI>
 </UL>
 
 <H1>Counters available for each hardware thread</H1>
@@ -186,4 +188,100 @@
 </TR>
 </TABLE>
 <P>*) The PWR2 counter is often not implemented by Intel® SandyBridge systems</P>
+
+\anchor SNB_UBOX
+<H2>Uncore global counters</H2>
+<P>The Intel® SandyBridge microarchitecture provides measurements for the global uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UNCORE_CLOCK</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNB_CBOX
+<H2>Last level cache counters</H2>
+<P>The Intel® SandyBridge microarchitecture provides measurements for the last level cache segments.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-3>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-3>C1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
 */
diff --git a/doc/archs/haswell.md b/doc/archs/skylake.md
similarity index 63%
copy from doc/archs/haswell.md
copy to doc/archs/skylake.md
index 65836bd..4c81c15 100644
--- a/doc/archs/haswell.md
+++ b/doc/archs/skylake.md
@@ -1,15 +1,19 @@
-/*! \page haswell Intel® Haswell
+/*! \page skylake Intel® Skylake
 
-<H1>Available performance monitors for the Intel® Haswell microarchitecture</H1>
+<P>This page is valid for Skylake. The Skylake microarchitecture supports the UBOX and the CBOX Uncore devices.</P>
+
+<H1>Available performance monitors for the Intel® Skylake microarchitecture</H1>
 <UL>
-<LI>\ref HAS_FIXED "Fixed-purpose counters"</LI>
-<LI>\ref HAS_PMC "General-purpose counters"</LI>
-<LI>\ref HAS_THERMAL "Thermal counters"</LI>
-<LI>\ref HAS_POWER "Power measurement counters"</LI>
+<LI>\ref SKL_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref SKL_PMC "General-purpose counters"</LI>
+<LI>\ref SKL_THERMAL "Thermal counters"</LI>
+<LI>\ref SKL_POWER "Power measurement counters"</LI>
+<LI>\ref SKL_UBOX "Uncore global counters"</LI>
+<LI>\ref SKL_CBOX "Last level cache counters"</LI>
 </UL>
 
 <H1>Counters available for each hardware thread</H1>
-\anchor HAS_FIXED
+\anchor SKL_FIXED
 <H2>Fixed-purpose counters</H2>
 <P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
 <H3>Counter and events</H3>
@@ -53,9 +57,9 @@
 </TR>
 </TABLE>
 
-\anchor HAS_PMC
+\anchor SKL_PMC
 <H2>General-purpose counters</H2>
-<P>The Intel® Haswell microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<P>Commonly the Intel® Skylake microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
 <H3>Counter and events</H3>
 <TABLE>
 <TR>
@@ -132,7 +136,7 @@
 </TABLE>
 
 <H3>Special handling for events</H3>
-<P>The Intel® Haswell microarchitecture provides measureing of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Haswell microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with the OFF [...]
+<P>The Intel® Skylake microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Skylake microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with the OFF [...]
 <TABLE>
 <TR>
   <TH>Option</TH>
@@ -143,21 +147,21 @@
 <TR>
   <TD>match0</TD>
   <TD>16 bit hex value</TD>
-  <TD>Input value masked with 0x8077 and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
-  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSW">https://download.01.org/perfmon/HSW</A>.</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SKL">https://download.01.org/perfmon/SKL</A>.</TD>
 </TR>
 <TR>
   <TD>match1</TD>
   <TD>22 bit hex value</TD>
   <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
-  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSW">https://download.01.org/perfmon/HSW</A>.</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SKL">https://download.01.org/perfmon/SKL</A>.</TD>
 </TR>
 </TABLE>
-<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility. Although we can program it from user-space, the results are always 0.</P>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility for performance monitoring. Although we can program it from user-space, the results are always 0.</P>
 
-\anchor HAS_THERMAL
+\anchor SKL_THERMAL
 <H2>Thermal counter</H2>
-<P>The Intel® Haswell microarchitecture provides one register for the current core temperature.</P>
+<P>The Intel® Skylake microarchitecture provides one register for the current core temperature.</P>
 <H3>Counter and events</H3>
 <TABLE>
 <TR>
@@ -171,9 +175,9 @@
 </TABLE>
 
 <H1>Counters available for one hardware thread per socket</H1>
-\anchor HAS_POWER
+\anchor SKL_POWER
 <H2>Power counter</H2>
-<P>The Intel® Haswell microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<P>The Intel® Skylake microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
 <H3>Counter and events</H3>
 <TABLE>
 <TR>
@@ -198,6 +202,73 @@
 </TR>
 </TABLE>
 
-*/
+\anchor SKL_UBOX
+<H2>Uncore global counters</H2>
+<P>The Intel® Skylake microarchitecture provides measurements for the global uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UNCORE_CLOCK</TD>
+</TR>
+</TABLE>
 
 
+\anchor SKL_CBOX
+<H2>Last level cache counters</H2>
+<P>The Intel® Skylake microarchitecture provides measurements for the last level cache segments.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-3>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-3>C1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+*/
diff --git a/doc/likwid-doxygen.md b/doc/likwid-doxygen.md
index 37d505e..32704f1 100644
--- a/doc/likwid-doxygen.md
+++ b/doc/likwid-doxygen.md
@@ -21,7 +21,7 @@ This is an effort to develop easy to use but yet powerful performance tools for
 - \ref likwid-memsweeper : A tool to cleanup ccNUMA domains and LLC caches to get a clean environment for benchmarks.
 - \ref likwid-bench : A benchmarking framework for streaming benchmark kernels written in assembly.
 - \ref likwid-genTopoCfg : A config file writer that gets system topology and writes them to file for faster LIKWID startup.
-<!-- - \ref likwid-features : A tool to toggle the prefetchers on Core 2 processors.-->
+- \ref likwid-features : A tool to toggle the prefetchers and print available CPU features.
 
 Wrapper scripts using the basic likwid tools:
 - \ref likwid-mpirun : A wrapper script enabling simple and flexible pinning of MPI and MPI/threaded hybrid applications. With integrated \ref likwid-perfctr support.
@@ -36,17 +36,18 @@ Optionally, a global configuration file \ref likwid.cfg can be given to modify s
 \section Library LIKWID Library
 \subsection C_Interface C/C++ Interface
 - \ref MarkerAPI
-- \ref AccessClient
+- \ref Access
 - \ref Config
 - \ref CPUTopology
 - \ref NumaTopology
 - \ref AffinityDomains
+- \ref CPUParse
 - \ref PerfMon
 - \ref PowerMon
 - \ref ThermalMon
 - \ref TimerMon
-- \ref Daemon
 - \ref MemSweep
+- \ref CpuFeatures
 
 \subsection Lua_Interface Lua Interface
 - \ref lua_Info
@@ -61,6 +62,7 @@ Optionally, a global configuration file \ref likwid.cfg can be given to modify s
 - \ref lua_ThermalInfo
 - \ref lua_Timer
 - \ref lua_MemSweep
+- \ref lua_cpuFeatures
 - \ref lua_Misc (Some functionality not provided by Lua natively)
 
 \subsection Fortran90_Interface Fortran90 Interface
@@ -84,6 +86,9 @@ Optionally, a global configuration file \ref likwid.cfg can be given to modify s
 - \subpage haswell
 - \subpage haswellep
 - \subpage broadwell
+- \subpage broadwelld
+- \subpage broadwellep
+- \subpage skylake
 
 \subsection Architectures_AMD AMD®
 - \subpage k8
@@ -126,7 +131,7 @@ All build products are generated in the directory ./TAG, where TAG is the compil
 \subsection config Configuration
 Usually the only thing you have to configure is the PREFIX install path in the build config file config.mk in the top directory.
 
-\subsubsection color Changing color of <CODE>likwid-pin</CODE> output
+\subsubsection color Changing color of likwid-pin output
 Depending on the background of your terminal window you can choose a color for <CODE>likwid-pin</CODE> output.
 
 \subsubsection accessD Usage of the access daemon likwid-accessD
@@ -175,8 +180,10 @@ NOTE: The pinning functionality and the daemons only work if configured in confi
 installed with <B>make install</B>. If you do not use the pinning functionality the tools
 can be used without installation.
 
- - <B>make install</B> - Installs the executables, libraries, man pages and headers to the path you configured in config.mk.
- - <B>make uninstall</B> - Delete all installed files.
+ - <B>make install</B> - Installs the executables, libraries, man pages and headers to the path you configured in config.mk (<CODE>PREFIX</CODE>).
+ - <B>make uninstall</B> - Delete all installed files under <CODE>PREFIX</CODE>.
+ - <B>make move</B> - Copy the executables, libraries, man pages and headers from <CODE>PREFIX</CODE> in config.mk to <CODE>INSTALLED_PREFIX</CODE>.
+ - <B>make uninstall_moved</B> - Delete all installed files under <CODE>INSTALLED_PREFIX</CODE>.
 
 \subsection accessD Setting up access for hardware performance monitoring
 Hardware performance monitoring on x86 is enabled using model-specific registers (MSR). MSR registers are special registers not part of the instruction set architecture. To read and write to these registers the x86 ISA provides special instructions. These instructions can only be executed in protected mode or in other words only kernel code can execute these instructions. Fortunately, any Linux kernel 2.6 or newer provides access to these registers via a set of device files. This allows  [...]
@@ -254,9 +261,18 @@ We would like to port LIKWID to other CPU architectures that support hardware pe
 \section faq10 Do you plan to introduce a graphical frontend for LIKWID?
 No, we do not!
 
-\section faq12 Why does the startup of likwid-perfctr take so long?
+\section faq11 Why does the startup of likwid-perfctr take so long?
 In order to get reliable time measurements, LIKWID must determine the base clock frequency of your CPU. This is done by a measurement loop that takes about 1 second. You can avoid the measurement loop by creating a topology configuration file with \ref likwid-genTopoCfg.
 
-\section faq13 I want to help, were do I start?
+\section faq12 What about the security issue found with the MSR device files (CVE-2013-0268)? Can someone use the access daemon to exploit this?
+No, this is not possible. In its current state, the access daemon only allows accesses to performance counter MSRs and not to MSRs like SYSENTER_EIP_MSR that are used in the exploit. Consequently, the access daemon cannot be used to exploit the security issue CVE-2013-0268.
+
+\section faq13 I get messages like "Counter register FOO not supported or PCI device not available", what does it mean?
+Every time an event set is added to LIKWID, it checks whether the registers are accessible. If not, such a message is printed. In most cases, this is not a failure; it just informs you that the counter will be skipped later during the measurements. It happens more often with the predefined performance groups because they are created for a maximally equipped machine. If your system has e.g. fewer memory channels than the maximum possible, not all MBOX registers will work but the counts wi [...]
+
+\section faq14 Likwid reports something like 'ERROR: The selected register XYZ is in use'. What causes this and is the -f/--force option safe?
+Some time ago, Intel released a document, the Intel PMU sharing guide, which lists some checks that should be performed before programming the hardware performance counters. Starting with commit 8efa0ec46a30438e1b85eac3ba31ebe0b7a03303 LIKWID now checks the counters and exits if one of the selected counters is in use. When you set <CODE>-f</CODE> or <CODE>--force</CODE> on the command line or set the environment variable <CODE>LIKWID_FORCE</CODE>, LIKWID ignores the already running counter and clea [...]
+
+\section faq15 I want to help, where do I start?
 The best way is to talk to us at the <A HREF="http://groups.google.com/group/likwid-users">mailing list</A>. There are a bunch of small work packages on our ToDo list that can be used as a good starting point for learning how LIKWID works. If you are not a programmer but you have a good idea, let us know and we will discuss it.
 */
diff --git a/doc/lua-doxygen.md b/doc/lua-doxygen.md
index c00b992..e662a46 100644
--- a/doc/lua-doxygen.md
+++ b/doc/lua-doxygen.md
@@ -129,6 +129,29 @@ or<BR>
 </TR>
 </TABLE>
 
+\anchor setGroupPath
+<H2>setGroupPath(path)</H2>
+<P>Change the path to the performance group files.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a path</TD>
+      <TD>Path to group files</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
 \anchor putConfiguration
 <H2>putConfiguration()</H2>
 <P>Frees the C-structures that were created by \ref getConfiguration function.</P>
@@ -786,7 +809,7 @@ or<BR>
 </TR>
 <TR>
   <TD>Returns</TD>
-  <TD>NUMA Info \ref lua_affinityinfo</TD>
+  <TD>Affinity Info \ref lua_affinityinfo</TD>
 </TR>
 </TABLE>
 <H2>putAffinityInfo()</H2>
@@ -1006,6 +1029,37 @@ or<BR>
 </TR>
 </TABLE>
 
+\anchor lua_groupinfo
+<H2>Info about performance groups</H2>
+<P>This structure is returned by \ref getGroups function</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a Index</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Name</TD>
+      <TD>Name of performance group</TD>
+    </TR>
+    <TR>
+      <TD>Info</TD>
+      <TD>Short description of the performance group</TD>
+    </TR>
+    <TR>
+      <TD>Long</TD>
+      <TD>Long description of the performance group</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+
 <H1>Function definitions for Lua performance monitoring module in the Lua API</H1>
 \anchor init
 <H2>init(nrThreads, thread2Cpus)</H2>
@@ -1176,6 +1230,24 @@ or<BR>
 </TR>
 </TABLE>
 
+\anchor finalize
+<H2>finalize()</H2>
+<P>Destroy internal structures and clean all used registers</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Always 0</TD>
+</TR>
+</TABLE>
+
 \anchor getResult
 <H2>getResult(groupID, eventID, threadID)</H2>
 <P>Get result for a group, event, thread combination. All options must be given</P>
@@ -1207,9 +1279,9 @@ or<BR>
 </TR>
 </TABLE>
 
-\anchor getResults
-<H2>getResults()</H2>
-<P>Get all results for all group, event, thread combinations</P>
+\anchor getLastResult
+<H2>getLastResult(groupID, eventID, threadID)</H2>
+<P>Get result for a group, event, thread combination of the last measurement cycle. All options must be given</P>
 <TABLE>
 <TR>
   <TH>Direction</TH>
@@ -1217,17 +1289,30 @@ or<BR>
 </TR>
 <TR>
   <TD>Input Parameter</TD>
-  <TD>None</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return result from group defined by \a groupID</TD>
+    </TR>
+    <TR>
+      <TD>\a eventID</TD>
+      <TD>Return result for event with \a eventID. Position in string given to \ref addEventSet function</TD>
+    </TR>
+    <TR>
+      <TD>\a threadID</TD>
+      <TD>Return result for thread with \a threadID as defined by the \a thread2Cpus input parameter for \ref init function</TD>
+    </TR>
+  </TABLE></TD>
 </TR>
 <TR>
   <TD>Returns</TD>
-  <TD>Three-dimensional list with results. First dim. is groups, second dim. is events and third dim. are the threads</TD>
+  <TD>Result</TD>
 </TR>
 </TABLE>
 
-\anchor getMarkerResults
-<H2>getMarkerResults(filename, group_list, num_cpus)</H2>
-<P>Get the results for an output file written by \ref MarkerAPI</P>
+\anchor getMetric
+<H2>getMetric(groupID, metricID, threadID)</H2>
+<P>Get the derived metric result for a group, metric, thread combination. All options must be given</P>
 <TABLE>
 <TR>
   <TH>Direction</TH>
@@ -1237,25 +1322,59 @@ or<BR>
   <TD>Input Parameter</TD>
   <TD><TABLE>
     <TR>
-      <TD>\a filename</TD>
-      <TD>Filename written by \ref MarkerAPI</TD>
+      <TD>\a groupID</TD>
+      <TD>Return result from group defined by \a groupID</TD>
+    </TR>
+    <TR>
+      <TD>\a metricID</TD>
+      <TD>Return result for metric with \a metricID.</TD>
+    </TR>
+    <TR>
+      <TD>\a threadID</TD>
+      <TD>Return result for thread with \a threadID as defined by the \a thread2Cpus input parameter for \ref init function</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Result</TD>
+</TR>
+</TABLE>
+
+\anchor getLastMetric
+<H2>getLastMetric(groupID, metricID, threadID)</H2>
+<P>Get the derived metric result for a group, metric, thread combination of the last measurement cycle. All options must be given</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return result from group defined by \a groupID</TD>
     </TR>
     <TR>
-      <TD>\a group_list</TD>
-      <TD>List of defined groups</TD>
+      <TD>\a metricID</TD>
+      <TD>Return result for metric with \a metricID.</TD>
     </TR>
     <TR>
-      <TD>\a num_cpus</TD>
-      <TD>Amount of defined CPUs. Is used just used for checking if the \ref MarkerAPI run is valid. If LIKWID_MARKER_THREADINIT is not called properly the tests will fail</TD>
+      <TD>\a threadID</TD>
+      <TD>Return result for thread with \a threadID as defined by the \a thread2Cpus input parameter for \ref init function</TD>
     </TR>
   </TABLE></TD>
 </TR>
 <TR>
   <TD>Returns</TD>
-  <TD>Four-dimensional list with results. First dim. is groups, second dim. is management regions, and third dim. are the events and fourth dim. are the threads</TD>
+  <TD>Result</TD>
 </TR>
 </TABLE>
 
+
+
+
 \anchor getEventsAndCounters
 <H2>getEventsAndCounters()</H2>
 <P>Get a list containing all event and counter definitions</P>
@@ -1364,7 +1483,7 @@ or<BR>
   <TD><TABLE>
     <TR>
       <TD>\a groupID</TD>
-      <TD>Return the measurement time for group defined by \a groupID</TD>
+      <TD>Return the number of events in group defined by \a groupID</TD>
     </TR>
   </TABLE></TD>
 </TR>
@@ -1374,6 +1493,29 @@ or<BR>
 </TR>
 </TABLE>
 
+\anchor getNumberOfMetrics
+<H2>getNumberOfMetrics(groupID)</H2>
+<P>Returns the amount of metrics for the given groupID</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return the number of derived metrics for group defined by \a groupID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of metrics in group</TD>
+</TR>
+</TABLE>
+
 \anchor getNumberOfThreads
 <H2>getNumberOfThreads()</H2>
 <P>Returns the number of threads as given to \ref init function</P>
@@ -1392,9 +1534,9 @@ or<BR>
 </TR>
 </TABLE>
 
-\anchor get_groups
-<H2>get_groups()</H2>
-<P>Returns a list of all performance groups in \a groupfolder</P>
+\anchor getNameOfEvent
+<H2>getNameOfEvent(groupID, eventID)</H2>
+<P>Returns the name of an event in a configured event set</P>
 <TABLE>
 <TR>
   <TH>Direction</TH>
@@ -1402,26 +1544,53 @@ or<BR>
 </TR>
 <TR>
   <TD>Input Parameter</TD>
-  <TD>None</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return event name from group defined by \a groupID</TD>
+    </TR>
+    <TR>
+      <TD>\a eventID</TD>
+      <TD>Return event name for event with \a eventID.</TD>
+    </TR>
+  </TABLE></TD>
 </TR>
 <TR>
   <TD>Returns</TD>
+  <TD>Name of event</TD>
+</TR>
+</TABLE>
+
+\anchor getNameOfCounter
+<H2>getNameOfCounter(groupID, eventID)</H2>
+<P>Returns the name of a counter in a configured event set</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
   <TD><TABLE>
     <TR>
-      <TD>\a numerOfGroups</TD>
-      <TD>Amount of groups in \a groupfolder for given \a architecture</TD>
+      <TD>\a groupID</TD>
+      <TD>Return counter name from group defined by \a groupID</TD>
     </TR>
     <TR>
-      <TD>\a groups</TD>
-      <TD>List with the names of all performance groups in \a groupfolder for given \a architecture</TD>
+      <TD>\a eventID</TD>
+      <TD>Return counter name for event with \a eventID.</TD>
     </TR>
   </TABLE></TD>
 </TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Name of counter</TD>
+</TR>
 </TABLE>
 
-\anchor get_groupdata
-<H2>get_groupdata(group)</H2>
-<P>Read in the performance group \a group</P>
+\anchor getNameOfMetric
+<H2>getNameOfMetric(groupID, metricID)</H2>
+<P>Returns the name of a derived metric in a configured performance group</P>
 <TABLE>
 <TR>
   <TH>Direction</TH>
@@ -1431,61 +1600,465 @@ or<BR>
   <TD>Input Parameter</TD>
   <TD><TABLE>
     <TR>
-      <TD>\a group</TD>
-      <TD>Get group data for \a group </TD>
+      <TD>\a groupID</TD>
+      <TD>Return metric name from group defined by \a groupID</TD>
+    </TR>
+    <TR>
+      <TD>\a metricID</TD>
+      <TD>Return metric name for metric with \a metricID.</TD>
     </TR>
   </TABLE></TD>
 </TR>
 <TR>
   <TD>Returns</TD>
+  <TD>Name of derived metric</TD>
+</TR>
+</TABLE>
+
+\anchor getNameOfGroup
+<H2>getNameOfGroup(groupID)</H2>
+<P>Returns the name of a configured performance group or 'Custom' for own event sets</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
   <TD><TABLE>
     <TR>
-      <TD>\a groupdata</TD>
-      <TD>Structure with all group information found for the performance group \a group, see \ref lua_groupdata</TD>
+      <TD>\a groupID</TD>
+      <TD>Return name of group defined by \a groupID</TD>
     </TR>
   </TABLE></TD>
 </TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Name of group</TD>
+</TR>
 </TABLE>
 
-*/
+\anchor getShortInfoOfGroup
+<H2>getShortInfoOfGroup(groupID)</H2>
+<P>Returns the short info string of a configured performance group or 'Custom' for own event sets</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return short description of a group defined by \a groupID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Short description of a group</TD>
+</TR>
+</TABLE>
 
-/*! \page lua_PowerInfo Power and Energy monitoring module
-<H1>Data type definition for Lua power and energy monitoring module in the Lua API</H1>
-\anchor lua_powerinfo
-<H2>Power Information</H2>
-<P>This structure is returned by \ref getPowerInfo function<BR>The nested list structure is almost similar to the C struct CpuTopology.</P>
+\anchor getLongInfoOfGroup
+<H2>getLongInfoOfGroup(groupID)</H2>
+<P>Returns the long info string of a configured performance group or 'Custom' for own event sets</P>
 <TABLE>
 <TR>
-  <TH>Membername</TH>
-  <TH>Comment</TH>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
 </TR>
 <TR>
-  <TD>\a hasRAPL</TD>
-  <TD>If set, the system supports power readings through the RAPL interface</TD>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return long description of a group defined by \a groupID</TD>
+    </TR>
+  </TABLE></TD>
 </TR>
 <TR>
-  <TD>\a baseFrequency</TD>
-  <TD>Nominal clock frequency of the system</TD>
+  <TD>Returns</TD>
+  <TD>Long description of a group</TD>
+</TR>
+</TABLE>
+
+\anchor getGroups
+<H2>getGroups()</H2>
+<P>Returns a list of all performance groups in \a groupfolder</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
 </TR>
 <TR>
-  <TD>\a minFrequency</TD>
-  <TD>Minimal supported clock frequency of the system</TD>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
 </TR>
 <TR>
-  <TD>\a powerUnit</TD>
-  <TD>Multiplier for power readings</TD>
+  <TD>Returns</TD>
+  <TD>List of performance groups, see \ref lua_groupinfo for structure</TD>
 </TR>
+</TABLE>
+
+\anchor get_groupdata
+<H2>get_groupdata(group)</H2>
+<P>Read in the performance group \a group</P>
+<TABLE>
 <TR>
-  <TD>\a timeUnit</TD>
-  <TD>Multiplier for time readings from RAPL</TD>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
 </TR>
 <TR>
-  <TD>\a turbo</TD>
-    <TD>
-    <TABLE>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
     <TR>
-      <TH>Membername</TH>
-      <TH>Comment</TH>
+      <TD>\a group</TD>
+      <TD>Get group data for \a group </TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupdata</TD>
+      <TD>Structure with all group information found for the performance group \a group, see \ref lua_groupdata</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor readMarkerFile
+<H2>readMarkerFile(filename)</H2>
+<P>Get the results for an output file written by \ref MarkerAPI</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a filename</TD>
+      <TD>Filename written by \ref MarkerAPI</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>No return value</TD>
+</TR>
+</TABLE>
+
+\anchor destroyMarkerFile
+<H2>destroyMarkerFile()</H2>
+<P>Destroy all results previously read in from the \ref MarkerAPI</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>No return value</TD>
+</TR>
+</TABLE>
+
+\anchor markerNumRegions
+<H2>markerNumRegions()</H2>
+<P>Get the number of regions defined in the file previously read in with \ref readMarkerFile</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of regions</TD>
+</TR>
+</TABLE>
+
+\anchor markerRegionGroup
+<H2>markerRegionGroup(regionID)</H2>
+<P>Get the group ID of a region read in with \ref readMarkerFile</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a regionID</TD>
+      <TD>Region ID to get group ID from</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Group ID for the region</TD>
+</TR>
+</TABLE>
+
+\anchor markerRegionTag
+<H2>markerRegionTag(regionID)</H2>
+<P>Get the region name of a region read in with \ref readMarkerFile</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a regionID</TD>
+      <TD>Region ID to get the name from</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Region name</TD>
+</TR>
+</TABLE>
+
+\anchor markerRegionEvents
+<H2>markerRegionEvents(regionID)</H2>
+<P>Get the number of events of a region read in with \ref readMarkerFile</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a regionID</TD>
+      <TD>Region ID to get the event count from</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Number of events</TD>
+</TR>
+</TABLE>
+
+\anchor markerRegionThreads
+<H2>markerRegionThreads(regionID)</H2>
+<P>Get the number of threads participating in a region read in with \ref readMarkerFile</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a regionID</TD>
+      <TD>Region ID to get the thread count from</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Number of threads</TD>
+</TR>
+</TABLE>
+
+\anchor markerRegionCpulist
+<H2>markerRegionCpulist(regionID)</H2>
+<P>Get a list of CPUs participating in a region read in with \ref readMarkerFile</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a regionID</TD>
+      <TD>Region ID to get the CPU list from</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>List with CPU IDs</TD>
+</TR>
+</TABLE>
+
+\anchor markerRegionTime
+<H2>markerRegionTime(regionID, threadID)</H2>
+<P>Get the accumulated measurement time for a region read in with \ref readMarkerFile</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a regionID</TD>
+      <TD>Region ID to get the time from</TD>
+    </TR>
+    <TR>
+      <TD>\a threadID</TD>
+      <TD>Thread ID to get the time from</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Measurement time</TD>
+</TR>
+</TABLE>
+
+\anchor markerRegionCount
+<H2>markerRegionCount(regionID, threadID)</H2>
+<P>Get the call count for a region read in with \ref readMarkerFile</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a regionID</TD>
+      <TD>Region ID to get the call count from</TD>
+    </TR>
+    <TR>
+      <TD>\a threadID</TD>
+      <TD>Thread ID to get the call count from</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Call count</TD>
+</TR>
+</TABLE>
+
+\anchor markerRegionResult
+<H2>markerRegionResult(regionID, eventID, threadID)</H2>
+<P>Get the result for a region and thread read in with \ref readMarkerFile</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a regionID</TD>
+      <TD>Region ID to get the result</TD>
+    </TR>
+    <TR>
+      <TD>\a eventID</TD>
+      <TD>Event ID to get the result</TD>
+    </TR>
+    <TR>
+      <TD>\a threadID</TD>
+      <TD>Thread ID to get the result</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Result</TD>
+</TR>
+</TABLE>
+
+\anchor markerRegionMetric
+<H2>markerRegionMetric(regionID, metricID, threadID)</H2>
+<P>Get the derived metric result for a region read in with \ref readMarkerFile</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a regionID</TD>
+      <TD>Region ID to get the derived metric result</TD>
+    </TR>
+    <TR>
+      <TD>\a metricID</TD>
+      <TD>Metric ID to get the derived metric result</TD>
+    </TR>
+    <TR>
+      <TD>\a threadID</TD>
+      <TD>Thread ID to get the derived metric result</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Metric result</TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_PowerInfo Power and Energy monitoring module
+<H1>Data type definition for Lua power and energy monitoring module in the Lua API</H1>
+\anchor lua_powerinfo
+<H2>Power Information</H2>
+<P>This structure is returned by \ref getPowerInfo function<BR>The nested list structure is almost similar to the C struct CpuTopology.</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a hasRAPL</TD>
+  <TD>If set, the system supports power readings through the RAPL interface</TD>
+</TR>
+<TR>
+  <TD>\a baseFrequency</TD>
+  <TD>Nominal clock frequency of the system</TD>
+</TR>
+<TR>
+  <TD>\a minFrequency</TD>
+  <TD>Minimal supported clock frequency of the system</TD>
+</TR>
+<TR>
+  <TD>\a powerUnit</TD>
+  <TD>Multiplier for power readings</TD>
+</TR>
+<TR>
+  <TD>\a timeUnit</TD>
+  <TD>Multiplier for time readings from RAPL</TD>
+</TR>
+<TR>
+  <TD>\a turbo</TD>
+    <TD>
+    <TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
     </TR>
     <TR>
       <TD>\a numSteps</TD>
@@ -1543,7 +2116,7 @@ or<BR>
         </TR>
         <TR>
           <TD>tdp</TD>
-          <TD>Thermal Design Power<BR>Only if supportInfo is set</TD>
+          <TD>Thermal Design Power<BR>Only if supportInfo is set</TD>
         </TR>
         <TR>
           <TD>minPower</TD>
@@ -1816,7 +2389,7 @@ or<BR>
 </TR>
 </TABLE>
 
-\anchor initTemp
+\anchor readTemp
 <H2>readTemp(cpuID)</H2>
 <P>Measure the temperature on given CPU</P>
 <TABLE>
@@ -1861,6 +2434,24 @@ or<BR>
 </TR>
 </TABLE>
 
+\anchor getCycleClock
+<H2>getCycleClock()</H2>
+<P>Returns the clock speed of the time stamp counter</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Cycle clock speed in Hz</TD>
+</TR>
+</TABLE>
+
 \anchor startClock
 <H2>startClock()</H2>
 <P>Start the TSC clock</P>
@@ -2098,6 +2689,65 @@ or<BR>
 </TR>
 </TABLE>
 
+\anchor waitpid
+<H2>waitpid(PID)</H2>
+<P>Wait until the program referenced by PID changes its state. Blocking.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a PID</TD>
+      <TD>PID to check status</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor catchSignal
+<H2>catchSignal()</H2>
+<P>Add signal handler for SIGINT</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor getSignalState
+<H2>getSignalState()</H2>
+<P>Check whether SIGINT signal was received</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of received SIGINT signals</TD>
+</TR>
+</TABLE>
+
 
 \anchor setenv
 <H2>setenv(Name, Value)</H2>
@@ -2257,8 +2907,113 @@ or<BR>
   </TABLE></TD>
 </TR>
 </TABLE>
+
+
+
 */
 
+/*! \page lua_cpuFeatures Module to read and manipulate CPU features
+<H1>Data type definition for Lua CPU features module in the Lua API</H1>
+<H1>Function definitions for Lua CPU features module in the Lua API</H1>
+\anchor cpuFeaturesInit
+<H2>cpuFeaturesInit()</H2>
+<P>Initialize the internal structures to enable CPU features module</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor cpuFeaturesGet
+<H2>cpuFeaturesGet(cpuID, featID)</H2>
+<P>Get feature state</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>CPU to read feature state</TD>
+    </TR>
+    <TR>
+      <TD>\a featID</TD>
+      <TD>ID of a feature</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor cpuFeaturesEnable
+<H2>cpuFeaturesEnable(cpuID, featID)</H2>
+<P>Enable feature for CPU</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>CPU to enable the feature</TD>
+    </TR>
+    <TR>
+      <TD>\a featID</TD>
+      <TD>ID of a feature</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>0 for success, all others are errors, either by MSR access or invalid feature</TD>
+</TR>
+</TABLE>
+
+\anchor cpuFeaturesDisable
+<H2>cpuFeaturesDisable(cpuID, featID)</H2>
+<P>Disable feature for CPU</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>CPU to disable the feature</TD>
+    </TR>
+    <TR>
+      <TD>\a featID</TD>
+      <TD>ID of a feature</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>0 for success, all others are errors, either by MSR access or invalid feature</TD>
+</TR>
+</TABLE>
+*/
 
 /*! \page lua_InputOutput Input and output functions module
 <H1>Data type definition for Lua output functions module in the Lua API</H1>
@@ -2422,9 +3177,9 @@ The option 'n' takes an argument, specified by the ':'. If found the option argu
 </TR>
 </TABLE>
 
-\anchor printOutput
-<H2>printOutput(groups, results, groupData, cpulist)</H2>
-<P>Prints results</P>
+\anchor getResults
+<H2>getResults()</H2>
+<P>Get all results for all group, event, thread combinations</P>
 <TABLE>
 <TR>
   <TH>Direction</TH>
@@ -2432,34 +3187,72 @@ The option 'n' takes an argument, specified by the ':'. If found the option argu
 </TR>
 <TR>
   <TD>Input Parameter</TD>
-  <TD><TABLE>
-    <TR>
-      <TD>\a groups</TD>
-      <TD>List of groups for printing</TD>
-    </TR>
-    <TR>
-      <TD>\a results</TD>
-      <TD>List of results as returned by \ref getResults function</TD>
-    </TR>
-    <TR>
-      <TD>\a groupData</TD>
-      <TD>List of group data structures</TD>
-    </TR>
-    <TR>
-      <TD>\a cpulist</TD>
-      <TD>List of thread ID to CPU ID relations</TD>
-    </TR>
-  </TABLE></TD>
+  <TD>None</TD>
 </TR>
 <TR>
   <TD>Returns</TD>
+  <TD>Three-dimensional list with results. First dim. is groups, second dim. is events and third dim. are the threads</TD>
+</TR>
+</TABLE>
+
+\anchor getLastResults
+<H2>getLastResults()</H2>
+<P>Get the results of the last measurement cycle for all group, event, thread combinations</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
   <TD>None</TD>
 </TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Three-dimensional list with results. First dim. is groups, second dim. is events and third dim. are the threads</TD>
+</TR>
 </TABLE>
 
-\anchor print_markerOutput
-<H2>print_markerOutput(groups, results, groupData, cpulist)</H2>
-<P>Prints results of a Marker API run. This is different to \ref printOutput because we have to resolve the measurement regions</P>
+\anchor getMetrics
+<H2>getMetrics()</H2>
+<P>Get all derived metric results for all group, metric, thread combinations</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Three-dimensional list with derived metric results. First dim. is groups, second dim. is metrics and third dim. are the threads</TD>
+</TR>
+</TABLE>
+
+\anchor getLastMetrics
+<H2>getLastMetrics()</H2>
+<P>Get the derived metric results of the last measurement cycle for all group, metric, thread combinations</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Three-dimensional list with derived metric results. First dim. is groups, second dim. is metrics and third dim. are the threads</TD>
+</TR>
+</TABLE>
+
+
+\anchor printOutput
+<H2>printOutput(results, metrics, cpulist, region, stats)</H2>
+<P>Prints results</P>
 <TABLE>
 <TR>
   <TH>Direction</TH>
@@ -2469,21 +3262,25 @@ The option 'n' takes an argument, specified by the ':'. If found the option argu
   <TD>Input Parameter</TD>
   <TD><TABLE>
     <TR>
-      <TD>\a groups</TD>
-      <TD>List of groups for printing</TD>
-    </TR>
-    <TR>
       <TD>\a results</TD>
-      <TD>List of results as returned by \ref getMarkerResults function</TD>
+      <TD>List of results with format list[ngroups][nevents][nthreads]</TD>
     </TR>
     <TR>
-      <TD>\a groupData</TD>
-      <TD>List of group data structures</TD>
+      <TD>\a metrics</TD>
+      <TD>List of metric results with format list[ngroups][nmetrics][nthreads]</TD>
     </TR>
     <TR>
       <TD>\a cpulist</TD>
       <TD>List of thread ID to CPU ID relations</TD>
     </TR>
+    <TR>
+      <TD>\a region</TD>
+      <TD>Name of region or 'nil' for no region</TD>
+    </TR>
+    <TR>
+      <TD>\a stats</TD>
+      <TD>Print statistics table for one CPU</TD>
+    </TR>
   </TABLE></TD>
 </TR>
 <TR>
@@ -2493,6 +3290,7 @@ The option 'n' takes an argument, specified by the ':'. If found the option argu
 </TABLE>
 
 
+
 \anchor addSimpleAsciiBox
 <H2>addSimpleAsciiBox(container, lineIdx, colIdx, label)</H2>
 <P>Add a simple ASCII box with given label to box container. This function is only used by \ref likwid-topology</P>
diff --git a/examples/C-likwidAPI.c b/examples/C-likwidAPI.c
index aa6ed4e..fcdd13e 100644
--- a/examples/C-likwidAPI.c
+++ b/examples/C-likwidAPI.c
@@ -6,7 +6,7 @@
  *      Description:  Example how to use the LIKWID API in C/C++ applications
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/examples/C-markerAPI.c b/examples/C-markerAPI.c
index 84f97a4..fec66c1 100644
--- a/examples/C-markerAPI.c
+++ b/examples/C-markerAPI.c
@@ -6,7 +6,7 @@
  *      Description:  Example how to use the C/C++ Marker API
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/examples/F-markerAPI.F90 b/examples/F-markerAPI.F90
index 5e2ff4b..3cafd52 100644
--- a/examples/F-markerAPI.F90
+++ b/examples/F-markerAPI.F90
@@ -5,7 +5,7 @@
 !      Description:  Example how to use the Fortran90 Marker API
 !
 !      Version:   4.1
-!      Released:  19.5.2016
+!      Released:  8.8.2016
 !
 !      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
 !      Project:  likwid
diff --git a/examples/Lua-likwidAPI.lua b/examples/Lua-likwidAPI.lua
index a77cdb8..f5c9fc3 100644
--- a/examples/Lua-likwidAPI.lua
+++ b/examples/Lua-likwidAPI.lua
@@ -8,7 +8,7 @@
  *      Description:  Example how to use the LIKWID API in Lua scripts
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/ext/hwloc/include/private/autogen/config.h b/ext/hwloc/include/private/autogen/config.h
index 966fa78..e220d4f 100644
--- a/ext/hwloc/include/private/autogen/config.h
+++ b/ext/hwloc/include/private/autogen/config.h
@@ -330,7 +330,7 @@
 #define HAVE_SYNC_BUILTINS 1
 
 /* Define to '1' if sysctl is present and usable */
-#define HAVE_SYSCTL 1
+/* #undef HAVE_SYSCTL */
 
 /* Define to '1' if sysctlbyname is present and usable */
 /* #undef HAVE_SYSCTLBYNAME */
@@ -359,7 +359,7 @@
 #define HAVE_SYS_STAT_H 1
 
 /* Define to 1 if you have the <sys/sysctl.h> header file. */
-#define HAVE_SYS_SYSCTL_H 1
+/* #undef HAVE_SYS_SYSCTL_H */
 
 /* Define to 1 if you have the <sys/time.h> header file. */
 #define HAVE_SYS_TIME_H 1
diff --git a/groups/broadwell/DATA.txt b/groups/broadwell/DATA.txt
index 967cbad..6955eb7 100644
--- a/groups/broadwell/DATA.txt
+++ b/groups/broadwell/DATA.txt
@@ -4,8 +4,8 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOPS_RETIRED_LOADS
-PMC1  MEM_UOPS_RETIRED_STORES
+PMC0  MEM_UOPS_RETIRED_LOADS_ALL
+PMC1  MEM_UOPS_RETIRED_STORES_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -16,7 +16,7 @@ Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL
 -
 This is a metric to determine your load to store ratio.
 
diff --git a/groups/broadwellD/DATA.txt b/groups/broadwellD/DATA.txt
index 967cbad..6955eb7 100644
--- a/groups/broadwellD/DATA.txt
+++ b/groups/broadwellD/DATA.txt
@@ -4,8 +4,8 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOPS_RETIRED_LOADS
-PMC1  MEM_UOPS_RETIRED_STORES
+PMC0  MEM_UOPS_RETIRED_LOADS_ALL
+PMC1  MEM_UOPS_RETIRED_STORES_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -16,7 +16,7 @@ Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL
 -
 This is a metric to determine your load to store ratio.
 
diff --git a/groups/broadwellEP/DATA.txt b/groups/broadwellEP/DATA.txt
index 967cbad..6955eb7 100644
--- a/groups/broadwellEP/DATA.txt
+++ b/groups/broadwellEP/DATA.txt
@@ -4,8 +4,8 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOPS_RETIRED_LOADS
-PMC1  MEM_UOPS_RETIRED_STORES
+PMC0  MEM_UOPS_RETIRED_LOADS_ALL
+PMC1  MEM_UOPS_RETIRED_STORES_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -16,7 +16,7 @@ Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL
 -
 This is a metric to determine your load to store ratio.
 
diff --git a/groups/broadwellEP/FALSE_SHARE.txt b/groups/broadwellEP/FALSE_SHARE.txt
index 9f8a30e..1a2fd70 100644
--- a/groups/broadwellEP/FALSE_SHARE.txt
+++ b/groups/broadwellEP/FALSE_SHARE.txt
@@ -5,6 +5,7 @@ FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM
+PMC1 MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM
 PMC2 MEM_UOPS_RETIRED_LOADS_ALL
 
 METRICS
diff --git a/groups/goldmont/BRANCH.txt b/groups/goldmont/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/goldmont/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/silvermont/ENERGY.txt b/groups/goldmont/CLOCK.txt
similarity index 73%
copy from groups/silvermont/ENERGY.txt
copy to groups/goldmont/CLOCK.txt
index d0996b3..088a776 100644
--- a/groups/silvermont/ENERGY.txt
+++ b/groups/goldmont/CLOCK.txt
@@ -4,25 +4,19 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
-PWR1  PWR_PP0_ENERGY
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Temperature [C]  TMP0
 Energy [J]  PWR0
 Power [W] PWR0/time
-Energy PP0 [J]  PWR1
-Power PP0 [W] PWR1/time
 
 LONG
 Formula:
-Power = PWR_PKG_ENERGY / time
-Power PP0 = PWR_PKG_ENERGY / time
+Power =  PWR_PKG_ENERGY / time
 -
 Silvermont implements the new RAPL interface. This interface enables to
 monitor the consumed energy on the package (socket) level.
diff --git a/groups/broadwell/DATA.txt b/groups/goldmont/DATA.txt
similarity index 71%
copy from groups/broadwell/DATA.txt
copy to groups/goldmont/DATA.txt
index 967cbad..61a915b 100644
--- a/groups/broadwell/DATA.txt
+++ b/groups/goldmont/DATA.txt
@@ -4,8 +4,8 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOPS_RETIRED_LOADS
-PMC1  MEM_UOPS_RETIRED_STORES
+PMC0  MEM_UOPS_RETIRED_ALL_LOADS
+PMC1  MEM_UOPS_RETIRED_ALL_STORES
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -16,7 +16,7 @@ Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+Load to store ratio = MEM_UOPS_RETIRED_ALL_LOADS/MEM_UOPS_RETIRED_ALL_STORES
 -
 This is a metric to determine your load to store ratio.
 
diff --git a/groups/silvermont/ENERGY.txt b/groups/goldmont/ENERGY.txt
similarity index 71%
copy from groups/silvermont/ENERGY.txt
copy to groups/goldmont/ENERGY.txt
index d0996b3..b94dd6a 100644
--- a/groups/silvermont/ENERGY.txt
+++ b/groups/goldmont/ENERGY.txt
@@ -7,6 +7,7 @@ FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
 PWR1  PWR_PP0_ENERGY
+PWR3  PWR_DRAM_ENERGY
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -18,12 +19,15 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy PP0 [J]  PWR1
 Power PP0 [W] PWR1/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
 
 LONG
 Formula:
 Power = PWR_PKG_ENERGY / time
-Power PP0 = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
 -
-Silvermont implements the new RAPL interface. This interface enables to
+Goldmont implements the new RAPL interface. This interface enables to
 monitor the consumed energy on the package (socket) level.
 
diff --git a/groups/goldmont/ICACHE.txt b/groups/goldmont/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/goldmont/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/goldmont/L2CACHE.txt b/groups/goldmont/L2CACHE.txt
new file mode 100644
index 0000000..32a1545
--- /dev/null
+++ b/groups/goldmont/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  LONGEST_LAT_CACHE_REFERENCE
+PMC1  LONGEST_LAT_CACHE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = LONGEST_LAT_CACHE_REFERENCE/INSTR_RETIRED_ANY
+L2 miss rate = LONGEST_LAT_CACHE_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = LONGEST_LAT_CACHE_MISS/LONGEST_LAT_CACHE_REFERENCE
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache
+reuse.
+
diff --git a/groups/goldmont/TLB_DATA.txt b/groups/goldmont/TLB_DATA.txt
new file mode 100644
index 0000000..b4679e5
--- /dev/null
+++ b/groups/goldmont/TLB_DATA.txt
@@ -0,0 +1,27 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  PAGE_WALKS_D_SIDE_COUNT
+PMC1  PAGE_WALKS_D_SIDE_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB misses     PMC0
+L1 DTLB miss rate  PMC0/FIXC0
+L1 DTLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 DTLB misses = PAGE_WALKS_D_SIDE_COUNT
+L1 DTLB miss rate = PAGE_WALKS_D_SIDE_COUNT / INSTR_RETIRED_ANY
+L1 DTLB miss duration [Cyc] = PAGE_WALKS_D_SIDE_CYCLES / PAGE_WALKS_D_SIDE_COUNT
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
+
diff --git a/groups/goldmont/TLB_INSTR.txt b/groups/goldmont/TLB_INSTR.txt
new file mode 100644
index 0000000..30dce1e
--- /dev/null
+++ b/groups/goldmont/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  PAGE_WALKS_I_SIDE_COUNT
+PMC1  PAGE_WALKS_I_SIDE_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = PAGE_WALKS_I_SIDE_COUNT
+L1 ITLB miss rate = PAGE_WALKS_I_SIDE_COUNT / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = PAGE_WALKS_I_SIDE_CYCLES / PAGE_WALKS_I_SIDE_COUNT
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
diff --git a/groups/ivybridgeEP/UNCORECLOCK.txt b/groups/ivybridgeEP/UNCORECLOCK.txt
index fef0d36..f8859fe 100644
--- a/groups/ivybridgeEP/UNCORECLOCK.txt
+++ b/groups/ivybridgeEP/UNCORECLOCK.txt
@@ -37,7 +37,7 @@ PBOX0 PBOX_CLOCKTICKS
 RBOX0C0 RBOX_CLOCKTICKS
 RBOX1C0 RBOX_CLOCKTICKS
 RBOX2C0 RBOX_CLOCKTICKS
-IBOX0 IBOX_CLOCKTICKS
+IBOX0C0 IBOX_CLOCKTICKS
 
 METRICS
 Runtime (RDTSC) [s] time
diff --git a/groups/silvermont/ENERGY.txt b/groups/silvermont/ENERGY.txt
index d0996b3..96ede02 100644
--- a/groups/silvermont/ENERGY.txt
+++ b/groups/silvermont/ENERGY.txt
@@ -22,7 +22,7 @@ Power PP0 [W] PWR1/time
 LONG
 Formula:
 Power = PWR_PKG_ENERGY / time
-Power PP0 = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
 -
 Silvermont implements the new RAPL interface. This interface enables to
 monitor the consumed energy on the package (socket) level.
diff --git a/perl/set_license.pl b/perl/set_license.pl
index b14801d..88155a2 100755
--- a/perl/set_license.pl
+++ b/perl/set_license.pl
@@ -13,12 +13,13 @@ my $lc = ' *';
 #my $VERSION   = '<VERSION>';
 #my $DATE   = '<DATE>';
 my $VERSION   = '4.1';
-my $DATE   = '19.5.2016';
+my $DATE   = '8.8.2016';
 my $YEAR  = '2016';
 my $AUTHOR = 'RRZE, University Erlangen-Nuremberg';
 my $LICENSE = 'gpl';
 
-my @SKIPLIST = ('ghash.c','ghash.h','loadData.S','bstrlib.c','bstrlib.h', 'calculator_stack.h', 'calculator_stack.c');
+my @SKIPLIST = ('ghash.c','ghash.h','loadData.S','bstrlib.c','bstrlib.h',
+    'calculator_stack.h', 'calculator_stack.c', 'calculator.c');
 
 sub print_copyright
 {
diff --git a/src/access-daemon/Makefile b/src/access-daemon/Makefile
index 5af6941..0bb2818 100644
--- a/src/access-daemon/Makefile
+++ b/src/access-daemon/Makefile
@@ -5,7 +5,7 @@
 #      Description:  accessDaemon Makefile
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/access-daemon/accessDaemon.c b/src/access-daemon/accessDaemon.c
index ee875fb..5c48688 100644
--- a/src/access-daemon/accessDaemon.c
+++ b/src/access-daemon/accessDaemon.c
@@ -6,7 +6,7 @@
  *      Description:  Implementation of access daemon.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Authors:  Michael Meier, michael.meier at rrze.fau.de
  *                Jan Treibig (jt), jan.treibig at gmail.com,
diff --git a/src/access-daemon/setFreq.c b/src/access-daemon/setFreq.c
index 6802449..43adc74 100644
--- a/src/access-daemon/setFreq.c
+++ b/src/access-daemon/setFreq.c
@@ -6,7 +6,7 @@
  *      Description:  Implementation of frequency daemon
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/access.c b/src/access.c
index 1102909..350cd9d 100644
--- a/src/access.c
+++ b/src/access.c
@@ -6,7 +6,7 @@
  *      Description:  Interface for the different register access modules.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/access_client.c b/src/access_client.c
index 93623f0..62a7e7c 100644
--- a/src/access_client.c
+++ b/src/access_client.c
@@ -1,3 +1,32 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access_client.c
+ *
+ *      Description:  Interface to the access daemon for the access module.
+ *
+ *      Version:   4.1
+ *      Released:  8.8.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdint.h>
@@ -10,6 +39,7 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 #include <pthread.h>
+#include <sys/syscall.h>
 
 #include <types.h>
 #include <error.h>
@@ -19,8 +49,11 @@
 #include <configuration.h>
 #include <affinity.h>
 
+#define gettid() syscall(SYS_gettid)
+
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
 static int globalSocket = -1;
+static pid_t masterPid = 0;
 static int cpuSockets_open = 0;
 static int cpuSockets[MAX_NUM_THREADS] = { [0 ... MAX_NUM_THREADS-1] = -1};
 static pthread_mutex_t globalLock = PTHREAD_MUTEX_INITIALIZER;
@@ -158,6 +191,10 @@ access_client_startDaemon(int cpu_id)
 int access_client_init(int cpu_id)
 {
     int ret = 0;
+    if (masterPid != 0 && gettid() == masterPid)
+    {
+        return 0;
+    }
     if (cpuSockets[cpu_id] < 0)
     {
         pthread_mutex_lock(&cpuLocks[cpu_id]);
@@ -168,6 +205,7 @@ int access_client_init(int cpu_id)
         {
             pthread_mutex_lock(&globalLock);
             globalSocket = cpuSockets[cpu_id];
+            masterPid = gettid();
             pthread_mutex_unlock(&globalLock);
         }
     }
@@ -188,6 +226,14 @@ int access_client_read(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint6
         return -ENOENT;
     }
 
+    if (cpuSockets[cpu_id] < 0 && gettid() != masterPid)
+    {
+        pthread_mutex_lock(&cpuLocks[cpu_id]);
+        cpuSockets[cpu_id] = access_client_startDaemon(cpu_id);
+        cpuSockets_open++;
+        pthread_mutex_unlock(&cpuLocks[cpu_id]);
+    }
+
     if ((cpuSockets[cpu_id] >= 0) && (cpuSockets[cpu_id] != globalSocket))
     {
         socket = cpuSockets[cpu_id];
@@ -249,6 +295,14 @@ int access_client_write(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint
         return -ENOENT;
     }
 
+    if (cpuSockets[cpu_id] < 0 && gettid() != masterPid)
+    {
+        pthread_mutex_lock(&cpuLocks[cpu_id]);
+        cpuSockets[cpu_id] = access_client_startDaemon(cpu_id);
+        cpuSockets_open++;
+        pthread_mutex_unlock(&cpuLocks[cpu_id]);
+    }
+
     if ((cpuSockets[cpu_id] >= 0) && (cpuSockets[cpu_id] != socket))
     {
         socket = cpuSockets[cpu_id];
diff --git a/src/access_x86.c b/src/access_x86.c
index 4cda3a7..dfb3ed5 100644
--- a/src/access_x86.c
+++ b/src/access_x86.c
@@ -1,3 +1,32 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access_x86.c
+ *
+ *      Description:  Interface to x86 related functions for the access module.
+ *
+ *      Version:   4.1
+ *      Released:  8.8.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdint.h>
diff --git a/src/access_x86_msr.c b/src/access_x86_msr.c
index 08a082d..1ce7aec 100644
--- a/src/access_x86_msr.c
+++ b/src/access_x86_msr.c
@@ -10,7 +10,7 @@
  *                   is based on the msr-util tools.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com.
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/access_x86_pci.c b/src/access_x86_pci.c
index c96f775..81ea0ae 100644
--- a/src/access_x86_pci.c
+++ b/src/access_x86_pci.c
@@ -1,7 +1,7 @@
 /*
  * =======================================================================================
  *
- *      Filename:  pci.c
+ *      Filename:  access_x86_pci.c
  *
  *      Description:  Implementation of pci module.
  *                   Provides API to read and write values to the hardware
@@ -9,7 +9,7 @@
  *                   for Intel Sandy Bridge Processors.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com,
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/affinity.c b/src/affinity.c
index 40f9e83..ec27643 100644
--- a/src/affinity.c
+++ b/src/affinity.c
@@ -6,7 +6,7 @@
  *      Description:  Implementation of affinity module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com,
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/applications/likwid-agent.lua b/src/applications/likwid-agent.lua
index 3f3e59a..b557cbc 100644
--- a/src/applications/likwid-agent.lua
+++ b/src/applications/likwid-agent.lua
@@ -7,7 +7,7 @@
  *      Description:  A monitoring daemon for hardware performance counters.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
diff --git a/src/applications/likwid-features.lua b/src/applications/likwid-features.lua
index 37d765d..787aa22 100644
--- a/src/applications/likwid-features.lua
+++ b/src/applications/likwid-features.lua
@@ -7,7 +7,7 @@
  *      Description:  A application to retrieve and manipulate CPU features.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
@@ -32,24 +32,27 @@ package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
 
 local likwid = require("likwid")
 
+print_stdout = print
+print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end end
+
 function version()
-    print(string.format("likwid-features --  Version %d.%d",likwid.version,likwid.release))
+    print_stdout(string.format("likwid-features --  Version %d.%d",likwid.version,likwid.release))
 end
 
 function usage()
     version()
-    print("A tool list and modify the states of CPU features.\n")
-    print("Options:")
-    print("-h, --help\t\t Help message")
-    print("-v, --version\t\t Version information")
-    print("-a, --all\t\t List all available features")
-    print("-l, --list\t\t List features and state for given CPUs")
-    print("-c, --cpus <list>\t Perform operations on given CPUs")
-    print("-e, --enable <list>\t List of features that should be enabled")
-    print("-d, --disable <list>\t List of features that should be disabled")
-    print()
-    print("Currently modifiable features:")
-    print("HW_PREFETCHER, CL_PREFETCHER, DCU_PREFETCHER, IP_PREFETCHER")
+    print_stdout("A tool list and modify the states of CPU features.\n")
+    print_stdout("Options:")
+    print_stdout("-h, --help\t\t Help message")
+    print_stdout("-v, --version\t\t Version information")
+    print_stdout("-a, --all\t\t List all available features")
+    print_stdout("-l, --list\t\t List features and state for given CPUs")
+    print_stdout("-c, --cpus <list>\t Perform operations on given CPUs")
+    print_stdout("-e, --enable <list>\t List of features that should be enabled")
+    print_stdout("-d, --disable <list>\t List of features that should be disabled")
+    print_stdout()
+    print_stdout("Currently modifiable features:")
+    print_stdout("HW_PREFETCHER, CL_PREFETCHER, DCU_PREFETCHER, IP_PREFETCHER")
 end
 
 if #arg == 0 then
@@ -68,8 +71,8 @@ for opt,arg in likwid.getopt(arg, {"h","v","l","c:","e:","d:","a","help","versio
     if (type(arg) == "string") then
         local s,e = arg:find("-");
         if s == 1 then
-            print(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
-            print("Did you forget an argument to an option?")
+            print_stderr(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
+            print_stderr("Did you forget an argument to an option?")
             os.exit(1)
         end
     end
@@ -84,15 +87,15 @@ for opt,arg in likwid.getopt(arg, {"h","v","l","c:","e:","d:","a","help","versio
     elseif opt == "l" or opt == "list" then
         listFeatures = true
     elseif opt == "a" or opt == "all" then
-        print("Available features:")
+        print_stdout("Available features:")
         for i=0,likwid.tablelength(likwid.cpuFeatures)-1 do
             if likwid.cpuFeatures[i]:match("PREFETCHER") then
-                print(string.format("\t%s*",likwid.cpuFeatures[i]))
+                print_stdout(string.format("\t%s*",likwid.cpuFeatures[i]))
             else
-                print(string.format("\t%s",likwid.cpuFeatures[i]))
+                print_stdout(string.format("\t%s",likwid.cpuFeatures[i]))
             end
         end
-        print("Modifiable features are marked with *")
+        print_stdout("Modifiable features are marked with *")
         os.exit(0)
     elseif opt == "e" or opt == "enable" then
         local tmp = likwid.stringsplit(arg, ",")
@@ -113,10 +116,10 @@ for opt,arg in likwid.getopt(arg, {"h","v","l","c:","e:","d:","a","help","versio
             end
         end
     elseif opt == "?" then
-        print("Invalid commandline option -"..arg)
+        print_stderr("Invalid commandline option -"..arg)
         os.exit(1)
     elseif opt == "!" then
-        print("Option requires an argument")
+        print_stderr("Option requires an argument")
         os.exit(1)
     end
 end
@@ -128,7 +131,7 @@ if listFeatures and #cpulist > 0 then
     for j, c in pairs(cpulist) do
         str = str..string.format("CPU %d\t",c)
     end
-    print(str)
+    print_stdout(str)
     str = ""
     for i=0,likwid.tablelength(likwid.cpuFeatures)-1 do
         str = likwid.cpuFeatures[i]..string.rep(" ",string.len("BRANCH_TRACE_STORAGE")-string.len(likwid.cpuFeatures[i])+2)
@@ -139,10 +142,10 @@ if listFeatures and #cpulist > 0 then
                 str = str .. "off\t"
             end
         end
-        print(str)
+        print_stdout(str)
     end
 elseif #cpulist == 0 then
-    print("Need CPU to list current feature state")
+    print_stderr("Need CPU to list current feature state")
     os.exit(1)
 end
 
@@ -150,7 +153,7 @@ if #enableList > 0 and #disableList > 0 then
     for i,e in pairs(enableList) do
         for j, d in pairs(disableList) do
             if (e == d) then
-                print(string.format("Feature %s is in enable and disable list, doing nothing for feature", e))
+                print_stderr(string.format("Feature %s is in enable and disable list, doing nothing for feature", e))
                 table.insert(skipList, e)
             end
         end
@@ -170,9 +173,9 @@ if #enableList > 0 then
         for j, f in pairs(enableList) do
             local ret = likwid.enableCpuFeatures(c, f, 1)
             if ret == 0 then
-                print(string.format("Enabled %s for CPU %d", likwid.cpuFeatures[f], c))
+                print_stdout(string.format("Enabled %s for CPU %d", likwid.cpuFeatures[f], c))
             else
-                print(string.format("Failed %s for CPU %d", likwid.cpuFeatures[f], c))
+                print_stdout(string.format("Failed %s for CPU %d", likwid.cpuFeatures[f], c))
             end
         end
     end
@@ -182,9 +185,9 @@ if #disableList > 0 then
         for j, f in pairs(disableList) do
             local ret = likwid.disableCpuFeatures(c, f, 1)
             if ret == 0 then
-                print(string.format("Disabled %s for CPU %d", likwid.cpuFeatures[f], c))
+                print_stdout(string.format("Disabled %s for CPU %d", likwid.cpuFeatures[f], c))
             else
-                print(string.format("Failed %s for CPU %d", likwid.cpuFeatures[f], c))
+                print_stdout(string.format("Failed %s for CPU %d", likwid.cpuFeatures[f], c))
             end
         end
     end
diff --git a/src/applications/likwid-genTopoCfg.lua b/src/applications/likwid-genTopoCfg.lua
index fdd4d69..845c359 100644
--- a/src/applications/likwid-genTopoCfg.lua
+++ b/src/applications/likwid-genTopoCfg.lua
@@ -9,7 +9,7 @@
  *                    each start.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
@@ -34,23 +34,26 @@ package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
 
 local likwid = require("likwid")
 
+print_stdout = print
+print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end end
+
 local filename = "<INSTALLED_PREFIX>/etc/likwid_topo.cfg"
 
 function version()
-    print(string.format("likwid-genTopoCfg --  Version %d.%d",likwid.version,likwid.release))
+    print_stdout(string.format("likwid-genTopoCfg --  Version %d.%d",likwid.version,likwid.release))
 end
 
 function usage()
     version()
-    print("A tool to store the system's architecture to a config file for LIKWID.\n")
-    print("Options:")
-    print("-h, --help\t\t Help message")
-    print("-v, --version\t\t Version information")
-    print("-o, --output <file>\t Use <file> instead of default "..filename)
-    print("\t\t\t Likwid searches at startup per default:")
-    print("\t\t\t /etc/likwid_topo.cfg and <INSTALLED_PREFIX>/etc/likwid_topo.cfg")
-    print("\t\t\t Another location can be configured in the configuration file /etc/likwid.cfg,")
-    print("\t\t\t <INSTALLED_PREFIX>/etc/likwid.cfg or the path defined at the build process of Likwid.")
+    print_stdout("A tool to store the system's architecture to a config file for LIKWID.\n")
+    print_stdout("Options:")
+    print_stdout("-h, --help\t\t Help message")
+    print_stdout("-v, --version\t\t Version information")
+    print_stdout("-o, --output <file>\t Use <file> instead of default "..filename)
+    print_stdout("\t\t\t Likwid searches at startup per default:")
+    print_stdout("\t\t\t /etc/likwid_topo.cfg and <INSTALLED_PREFIX>/etc/likwid_topo.cfg")
+    print_stdout("\t\t\t Another location can be configured in the configuration file /etc/likwid.cfg,")
+    print_stdout("\t\t\t <INSTALLED_PREFIX>/etc/likwid.cfg or the path defined at the build process of Likwid.")
 end
 
 for opt,arg in likwid.getopt(arg, {"h","v","help","version", "o:", "output:"}) do
@@ -63,22 +66,22 @@ for opt,arg in likwid.getopt(arg, {"h","v","help","version", "o:", "output:"}) d
     elseif opt == "o" or opt == "output" then
         filename = arg
     elseif opt == "?" then
-        print("Invalid commandline option -"..arg)
+        print_stderr("Invalid commandline option -"..arg)
         os.exit(1)
     elseif opt == "!" then
-        print("Option requires an argument")
+        print_stderr("Option requires an argument")
         os.exit(1)
     end
 end
 local file = io.open(filename, "r")
 if file ~= nil then
-    print("File "..filename.." exists, please delete it first.")
+    print_stderr("File "..filename.." exists, please delete it first.")
     file:close()
     os.exit(1)
 end
 file = io.open(filename, "w")
 if file == nil then
-    print("Cannot open file "..filename.." for writing")
+    print_stderr("Cannot open file "..filename.." for writing")
     os.exit(1)
 end
 
@@ -88,10 +91,10 @@ local cputopo = likwid.getCpuTopology()
 local numainfo = likwid.getNumaInfo()
 local affinity = likwid.getAffinityInfo()
 if cpuinfo == nil or cputopo == nil or numainfo == nil or affinity == nil then
-    print("Cannot initialize topology module of LIKWID")
+    print_stderr("Cannot initialize topology module of LIKWID")
     os.exit(1)
 end
-print(string.format("Writing new topology file %s", filename))
+print_stdout(string.format("Writing new topology file %s", filename))
 cpuinfo["clock"] = likwid.getCpuClock()
 
 local threadPool_order = {"threadId", "coreId", "packageId", "apicId"}
diff --git a/src/applications/likwid-memsweeper.lua b/src/applications/likwid-memsweeper.lua
index d3315ac..999dedd 100644
--- a/src/applications/likwid-memsweeper.lua
+++ b/src/applications/likwid-memsweeper.lua
@@ -7,7 +7,7 @@
  *      Description:  An application to clean up NUMA memory domains.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
@@ -31,29 +31,32 @@
 package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
 local likwid = require("likwid")
 
+print_stdout = print
+print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end end
+
 local function version()
-    print(string.format("likwid-memsweeper --  Version %d.%d",likwid.version,likwid.release))
+    print_stdout(string.format("likwid-memsweeper --  Version %d.%d",likwid.version,likwid.release))
 end
 
 local function examples()
-    print("Examples:")
-    print("To clean specific domain:")
-    print("likwid-memsweeper -c 2")
-    print("To clean a range of domains:")
-    print("likwid-memsweeper -c 1-2")
-    print("To clean specific domains:")
-    print("likwid-memsweeper -c 0,1-2")
+    print_stdout("Examples:")
+    print_stdout("To clean specific domain:")
+    print_stdout("likwid-memsweeper -c 2")
+    print_stdout("To clean a range of domains:")
+    print_stdout("likwid-memsweeper -c 1-2")
+    print_stdout("To clean specific domains:")
+    print_stdout("likwid-memsweeper -c 0,1-2")
 
 end
 
 local function usage()
     version()
-    print("A tool clean up NUMA memory domains.\n")
-    print("Options:")
-    print("-h\t\t Help message")
-    print("-v\t\t Version information")
-    print("-c <list>\t Specify NUMA domain ID to clean up")
-    print("")
+    print_stdout("A tool clean up NUMA memory domains.\n")
+    print_stdout("Options:")
+    print_stdout("-h\t\t Help message")
+    print_stdout("-v\t\t Version information")
+    print_stdout("-c <list>\t Specify NUMA domain ID to clean up")
+    print_stdout("")
     examples()
 end
 
@@ -75,10 +78,10 @@ for opt,arg in likwid.getopt(arg, {"c:", "h", "v", "help", "version"}) do
     elseif (opt == "c") then
         num_nodes, nodes = likwid.nodestr_to_nodelist(arg)
     elseif opt == "?" then
-        print("Invalid commandline option -"..arg)
+        print_stderr("Invalid commandline option -"..arg)
         os.exit(1)
     elseif opt == "!" then
-        print("Option requires an argument")
+        print_stderr("Option requires an argument")
         os.exit(1)
     end
 end
diff --git a/src/applications/likwid-mpirun.lua b/src/applications/likwid-mpirun.lua
index 07d6dc4..7c090d6 100644
--- a/src/applications/likwid-mpirun.lua
+++ b/src/applications/likwid-mpirun.lua
@@ -8,7 +8,7 @@
  *                   measure hardware performance counters
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
@@ -33,53 +33,56 @@ package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
 
 local likwid = require("likwid")
 
+print_stdout = print
+print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end end
+
 local function version()
-    print(string.format("likwid-mpirun --  Version %d.%d",likwid.version,likwid.release))
+    print_stdout(string.format("likwid-mpirun --  Version %d.%d",likwid.version,likwid.release))
 end
 
 local function examples()
-    print("Examples:")
-    print("Run 32 processes on hosts in hostlist")
-    print("likwid-mpirun -np 32 ./a.out")
-    print("")
-    print("Run 1 MPI process on each socket")
-    print("likwid-mpirun -nperdomain S:1 ./a.out")
-    print("Total amount of MPI processes is calculated using the number of hosts in the hostfile")
-    print("")
-    print("For hybrid MPI/OpenMP jobs you need to set the -pin option")
-    print("Starts 2 MPI processes on each host, one on socket 0 and one on socket 1")
-    print("Each MPI processes may start 2 OpenMP threads pinned to the first two CPUs on each socket")
-    print("likwid-mpirun -pin S0:0-1_S1:0-1 ./a.out")
-    print("")
-    print("Run 2 processes on each socket and measure the MEM performance group")
-    print("likwid-mpirun -nperdomain S:2 -g MEM ./a.out")
-    print("Only one process on a socket measures the Uncore/RAPL counters, the other one(s) only core-local counters")
-    print("")
+    print_stdout("Examples:")
+    print_stdout("Run 32 processes on hosts in hostlist")
+    print_stdout("likwid-mpirun -np 32 ./a.out")
+    print_stdout("")
+    print_stdout("Run 1 MPI process on each socket")
+    print_stdout("likwid-mpirun -nperdomain S:1 ./a.out")
+    print_stdout("Total amount of MPI processes is calculated using the number of hosts in the hostfile")
+    print_stdout("")
+    print_stdout("For hybrid MPI/OpenMP jobs you need to set the -pin option")
+    print_stdout("Starts 2 MPI processes on each host, one on socket 0 and one on socket 1")
+    print_stdout("Each MPI processes may start 2 OpenMP threads pinned to the first two CPUs on each socket")
+    print_stdout("likwid-mpirun -pin S0:0-1_S1:0-1 ./a.out")
+    print_stdout("")
+    print_stdout("Run 2 processes on each socket and measure the MEM performance group")
+    print_stdout("likwid-mpirun -nperdomain S:2 -g MEM ./a.out")
+    print_stdout("Only one process on a socket measures the Uncore/RAPL counters, the other one(s) only core-local counters")
+    print_stdout("")
 end
 
 local function usage()
     version()
-    print("A wrapper script to pin threads spawned by MPI processes and measure hardware performance counters.\n")
-    print("Options:")
-    print("-h, --help\t\t Help message")
-    print("-v, --version\t\t Version information")
-    print("-d, --debug\t\t Debugging output")
-    print("-n/-np <count>\t\t Set the number of processes")
-    print("-nperdomain <domain>\t Set the number of processes per node by giving an affinity domain and count")
-    print("-pin <list>\t\t Specify pinning of threads. CPU expressions like likwid-pin separated with '_'")
-    print("-s, --skip <hex>\t Bitmask with threads to skip")
-    print("-mpi <id>\t\t Specify which MPI should be used. Possible values: openmpi, intelmpi and mvapich2")
-    print("\t\t\t If not set, module system is checked")
-    print("-omp <id>\t\t Specify which OpenMP should be used. Possible values: gnu and intel")
-    print("\t\t\t Only required for statically linked executables.")
-    print("-hostfile\t\t Use custom hostfile instead of searching the environment")
-    print("-g/-group <perf>\t Set a likwid-perfctr conform event set for measuring on nodes")
-    print("-m/-marker\t\t Activate marker API mode")
-    print("-O\t\t\t Output easily parseable CSV instead of fancy tables")
-    print("-f\t\t\t Force overwrite of registers if they are in use. You can also use environment variable LIKWID_FORCE")
-    print("")
-    print("Processes are pinned to physical CPU cores first. For syntax questions see likwid-pin")
-    print("")
+    print_stdout("A wrapper script to pin threads spawned by MPI processes and measure hardware performance counters.\n")
+    print_stdout("Options:")
+    print_stdout("-h, --help\t\t Help message")
+    print_stdout("-v, --version\t\t Version information")
+    print_stdout("-d, --debug\t\t Debugging output")
+    print_stdout("-n/-np <count>\t\t Set the number of processes")
+    print_stdout("-nperdomain <domain>\t Set the number of processes per node by giving an affinity domain and count")
+    print_stdout("-pin <list>\t\t Specify pinning of threads. CPU expressions like likwid-pin separated with '_'")
+    print_stdout("-s, --skip <hex>\t Bitmask with threads to skip")
+    print_stdout("-mpi <id>\t\t Specify which MPI should be used. Possible values: openmpi, intelmpi and mvapich2")
+    print_stdout("\t\t\t If not set, module system is checked")
+    print_stdout("-omp <id>\t\t Specify which OpenMP should be used. Possible values: gnu and intel")
+    print_stdout("\t\t\t Only required for statically linked executables.")
+    print_stdout("-hostfile\t\t Use custom hostfile instead of searching the environment")
+    print_stdout("-g/-group <perf>\t Set a likwid-perfctr conform event set for measuring on nodes")
+    print_stdout("-m/-marker\t\t Activate marker API mode")
+    print_stdout("-O\t\t\t Output easily parseable CSV instead of fancy tables")
+    print_stdout("-f\t\t\t Force overwrite of registers if they are in use. You can also use environment variable LIKWID_FORCE")
+    print_stdout("")
+    print_stdout("Processes are pinned to physical CPU cores first. For syntax questions see likwid-pin")
+    print_stdout("")
     examples()
 end
 
@@ -96,6 +99,7 @@ local mpitype = nil
 local omptype = nil
 local skipStr = ""
 local executable = {}
+local mpiopts = {}
 local debug = false
 local use_marker = false
 local use_csv = false
@@ -112,6 +116,7 @@ local writeHostfile = nil
 local getEnvironment = nil
 local executeCommand = nil
 local mpiexecutable = nil
+local hostpattern = "([%.%a%d_-]+)"
 
 
 local function readHostfileOpenMPI(filename)
@@ -121,21 +126,21 @@ local function readHostfileOpenMPI(filename)
     end
     local f = io.open(filename, "r")
     if f == nil then
-        print("ERROR: Cannot open hostfile "..filename)
+        print_stderr("ERROR: Cannot open hostfile "..filename)
         os.exit(1)
     end
     if debug then
-        print("DEBUG: Reading hostfile in openmpi style")
+        print_stdout("DEBUG: Reading hostfile in openmpi style")
     end
     local t = f:read("*all")
     f:close()
     for i, line in pairs(likwid.stringsplit(t,"\n")) do
         if line:match("^#") == nil and line:match("^%s*$") == nil then
-            hostname, slots, maxslots = line:match("^([%.%a%d]+)%s+slots=(%d*)%s+max%-slots=(%d*)")
+            hostname, slots, maxslots = line:match("^"..hostpattern.."%s+slots=(%d*)%s+max%-slots=(%d*)")
             if not hostname then
-                hostname, slots = line:match("^([%.%a%d]+)%s+slots=(%d*)")
+                hostname, slots = line:match("^"..hostpattern.."%s+slots=(%d*)")
                 if not hostname then
-                    hostname = line:match("^([%.%a%d]+)")
+                    hostname = line:match("^"..hostpattern)
                     slots = 1
                     maxslots = 1
                 end
@@ -166,7 +171,7 @@ local function readHostfileOpenMPI(filename)
             host["maxslots"] = topo.numHWThreads
         end
         if debug then
-            print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+            print_stdout(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
         end
     end
     return hostlist
@@ -179,7 +184,7 @@ local function writeHostfileOpenMPI(hostlist, filename)
 
     local f = io.open(filename, "w")
     if f == nil then
-        print("ERROR: Cannot open hostfile "..filename)
+        print_stderr("ERROR: Cannot open hostfile "..filename)
         os.exit(1)
     end
     for i, hostcontent in pairs(hostlist) do
@@ -219,13 +224,14 @@ local function executeOpenMPI(wrapperscript, hostfile, env, nrNodes)
         f:close()
     end
 
-    local cmd = string.format("%s -hostfile %s %s -np %d -npernode %d %s",
+    local cmd = string.format("%s -hostfile %s %s -np %d -npernode %d %s %s",
                                 mpiexecutable, hostfile, bindstr,
-                                np, ppn, wrapperscript)
+                                np, ppn, table.concat(mpiopts, ' '), wrapperscript)
     if debug then
-        print("EXEC: "..cmd)
+        print_stdout("EXEC: "..cmd)
     end
-    os.execute(cmd)
+    local ret = os.execute(cmd)
+    return ret
 end
 
 local function readHostfileIntelMPI(filename)
@@ -235,28 +241,28 @@ local function readHostfileIntelMPI(filename)
     end
     local f = io.open(filename, "r")
     if f == nil then
-        print("ERROR: Cannot open hostfile "..filename)
+        print_stderr("ERROR: Cannot open hostfile "..filename)
         os.exit(1)
     end
     if debug then
-        print("DEBUG: Reading hostfile in intelmpi style")
+        print_stdout("DEBUG: Reading hostfile in intelmpi style")
     end
     local topo = likwid.getCpuTopology()
     local t = f:read("*all")
     f:close()
     for i, line in pairs(likwid.stringsplit(t,"\n")) do
         if line:match("^#") == nil and line:match("^%s*$") == nil then
-            hostname, slots = line:match("^([%.%a%d]+):(%d+)")
+            hostname, slots = line:match("^"..hostpattern..":(%d+)")
             if not hostname then
-                hostname = line:match("^([%.%a%d]+)")
+                hostname = line:match("^"..hostpattern)
                 slots = topo["numHWThreads"]
             end
-            table.insert(hostlist, {hostname=hostname, slots=slots, maxslots=slots})
+            table.insert(hostlist, {hostname=hostname, slots=tonumber(slots), maxslots=tonumber(slots)})
         end
     end
     if debug then
         for i, host in pairs(hostlist) do
-            print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+            print_stdout(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
         end
     end
     return hostlist
@@ -269,7 +275,7 @@ local function writeHostfileIntelMPI(hostlist, filename)
 
     local f = io.open(filename, "w")
     if f == nil then
-        print("ERROR: Cannot open hostfile "..filename)
+        print_stderr("ERROR: Cannot open hostfile "..filename)
         os.exit(1)
     end
     for i, hostcontent in pairs(hostlist) do
@@ -291,6 +297,7 @@ end
 
 local function executeIntelMPI(wrapperscript, hostfile, env, nrNodes)
     local use_hydra = true
+    local mpi_connect = "ssh"
     if wrapperscript.sub(1,1) ~= "/" then
         wrapperscript = os.getenv("PWD").."/"..wrapperscript
     end
@@ -322,25 +329,33 @@ local function executeIntelMPI(wrapperscript, hostfile, env, nrNodes)
             envstr = envstr .. string.format("-env %s %s ", i, e)
         end
     end
+    for i,e in pairs(mpiopts) do
+        envstr = envstr .. string.format("%s ",e)
+    end
+    if os.getenv("LIKWID_MPI_CONNECT") ~= nil then
+        mpi_connect = os.getenv("LIKWID_MPI_CONNECT")
+    end
 
     if debug then
         if use_hydra == false then
-            print(string.format("EXEC: %s/mpdboot -r ssh -n %d -f %s", path, nrNodes, hostfile))
-            print(string.format("EXEC: %s/mpiexec -perhost %d %s -np %d %s", path, ppn, envstr, np, wrapperscript))
-            print(string.format("EXEC: %s/mpdallexit", path))
+            print_stdout(string.format("EXEC: %s/mpdboot -r %s -n %d -f %s", path, mpi_connect, nrNodes, hostfile))
+            print_stdout(string.format("EXEC: %s/mpiexec -perhost %d %s -np %d %s", path, ppn, envstr, np, wrapperscript))
+            print_stdout(string.format("EXEC: %s/mpdallexit", path))
         else
-            print(string.format("%s %s -f %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript))
+            print_stdout(string.format("%s %s -f %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript))
         end
     end
 
     --os.execute(string.format("%s -genv I_MPI_PIN 0 -f %s -np %d -perhost %d %s",mpiexecutable, hostfile, np, ppn, wrapperscript))
+    local ret = 0
     if use_hydra == false then
-        os.execute(string.format("%s/mpdboot -r ssh -n %d -f %s", path, nrNodes, hostfile))
-        os.execute(string.format("%s/mpiexec -perhost %d %s -np %d %s", path, ppn, envstr, np, wrapperscript))
-        os.execute(string.format("%s/mpdallexit", path))
+        ret = os.execute(string.format("%s/mpdboot -r %s -n %d -f %s", path, mpi_connect, nrNodes, hostfile))
+        ret = os.execute(string.format("%s/mpiexec -perhost %d %s -np %d %s", path, ppn, envstr, np, wrapperscript))
+        ret = os.execute(string.format("%s/mpdallexit", path))
     else
-        os.execute(string.format("%s %s -f %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript))
+        ret = os.execute(string.format("%s %s -f %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript))
     end
+    return ret
 end
 
 local function readHostfileMvapich2(filename)
@@ -350,33 +365,33 @@ local function readHostfileMvapich2(filename)
     end
     local f = io.open(filename, "r")
     if f == nil then
-        print("ERROR: Cannot open hostfile "..filename)
+        print_stderr("ERROR: Cannot open hostfile "..filename)
         os.exit(1)
     end
     if debug then
-        print("DEBUG: Reading hostfile in mvapich2 style")
+        print_stdout("DEBUG: Reading hostfile in mvapich2 style")
     end
     local t = f:read("*all")
     f:close()
     for i, line in pairs(likwid.stringsplit(t,"\n")) do
         if line:match("^#") == nil and line:match("^%s*$") == nil then
-            hostname, slots, interface = line:match("^([%.%a%d]+):(%d+):([%a%d]+)")
+            hostname, slots, interface = line:match("^"..hostpattern..":(%d+):([%a%d]+)")
             if not hostname then
-                hostname, slots = line:match("^([%.%a%d]+):(%d+)")
+                hostname, slots = line:match("^"..hostpattern..":(%d+)")
                 if not hostname then
-                    hostname = line:match("^([%.%a%d]+)")
+                    hostname = line:match("^"..hostpattern)
                     slots = 1
                     interface = nil
                 else
                     interface = nil
                 end
             end
-            table.insert(hostlist, {hostname=hostname, slots=slots, maxslots=slots, interface=interface})
+            table.insert(hostlist, {hostname=hostname, slots=tonumber(slots), maxslots=tonumber(slots), interface=interface})
         end
     end
     if debug then
         for i, host in pairs(hostlist) do
-            print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+            print_stdout(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
         end
     end
     return hostlist
@@ -389,7 +404,7 @@ local function writeHostfileMvapich2(hostlist, filename)
 
     local f = io.open(filename, "w")
     if f == nil then
-        print("ERROR: Cannot open hostfile "..filename)
+        print_stderr("ERROR: Cannot open hostfile "..filename)
         os.exit(1)
     end
     for i, hostcontent in pairs(hostlist) do
@@ -424,13 +439,14 @@ local function executeMvapich2(wrapperscript, hostfile, env, nrNodes)
         envstr = envstr .. string.format("%s=%s ", i, e)
     end
 
-    local cmd = string.format("%s -f %s -np %d -ppn %d %s %s",
+    local cmd = string.format("%s -f %s -np %d -ppn %d %s %s %s",
                                 mpiexecutable, hostfile,
-                                np, ppn, envstr, wrapperscript)
+                                np, ppn, envstr, table.concat(mpiopts, ' '), wrapperscript)
     if debug then
-        print("EXEC: "..cmd)
+        print_stdout("EXEC: "..cmd)
     end
-    os.execute(cmd)
+    local ret = os.execute(cmd)
+    return ret
 end
 
 
@@ -441,17 +457,17 @@ local function readHostfilePBS(filename)
     end
     local f = io.open(filename, "r")
     if f == nil then
-        print("ERROR: Cannot open hostfile "..filename)
+        print_stderr("ERROR: Cannot open hostfile "..filename)
         os.exit(1)
     end
     if debug then
-        print("DEBUG: Reading hostfile from batch system")
+        print_stdout("DEBUG: Reading hostfile from batch system")
     end
     local t = f:read("*all")
     f:close()
     for i, line in pairs(likwid.stringsplit(t,"\n")) do
         if line:match("^#") == nil and line:match("^%s*$") == nil then
-            hostname = line:match("^([%.%a%d]+)")
+            hostname = line:match("^"..hostpattern)
             local found = false
             for i, host in pairs(hostlist) do
                 if host["hostname"] == hostname then
@@ -468,7 +484,7 @@ local function readHostfilePBS(filename)
     end
     if debug then
         for i, host in pairs(hostlist) do
-            print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+            print_stdout(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
         end
     end
     return hostlist
@@ -496,7 +512,7 @@ function write_hostlist_to_file(hostlist, nperhost)
         else
             prefixzeros = 0
             
-            host, start, ende,remain = item:match("(%w+)%[(%d+)-(%d+)%]([%w%d%[%]-]*)")
+            host, start, ende,remain = item:match("(%a+)%[(%d+)-(%d+)%]([%w%d%[%]-]*)")
             if host and start and ende then
                 if tonumber(start) ~= 0 then
                     for j=1,#start do
@@ -536,7 +552,6 @@ local function writeHostfileSlurm(hostlist, filename)
     for i, h in pairs(hostlist) do
         table.insert(l, h["hostname"])
     end
-    print("SLURM_NODELIST", table.concat(l,","))
     likwid.setenv("SLURM_NODELIST", table.concat(l,","))
 end
 
@@ -549,12 +564,15 @@ local function executeSlurm(wrapperscript, hostfile, env, nrNodes)
         wrapperscript = os.getenv("PWD").."/"..wrapperscript
     end
     
-    local exec = string.format("srun -N %d --ntasks-per-node=%d --cpu_bind=none %s", nrNodes, ppn, wrapperscript)
+    local exec = string.format("srun -N %d --ntasks-per-node=%d --cpu_bind=none %s %s",
+                                nrNodes, ppn, table.concat(mpiopts, ' '), wrapperscript)
     if debug then
-        print("EXEC: "..exec)
+        print_stdout("EXEC: "..exec)
     end
-    os.execute(exec)
+    local ret = os.execute(exec)
+    return ret
 end
+
 local function getNumberOfNodes(hostlist)
     local n = 0
     for i, h in pairs(hostlist) do
@@ -636,7 +654,7 @@ local function getMpiType()
         end
     end
     if not mpitype then
-        print("WARN: No supported MPI loaded in module system")
+        print_stderr("WARN: No supported MPI loaded in module system")
     end
     return mpitype
 end
@@ -707,7 +725,7 @@ local function getOmpType()
         end
     end
     if not omptype and dyn_linked == false then
-        print("WARN: Cannot get OpenMP variant from executable, trying module system")
+        print_stderr("WARN: Cannot get OpenMP variant from executable, trying module system")
         cmd = "bash -c 'tclsh /apps/modules/modulecmd.tcl sh list -t' 2>&1"
         local f = io.popen(cmd, 'r')
         if f == nil then
@@ -728,7 +746,7 @@ local function getOmpType()
             end
         end
         if not omptype then
-            print("WARN: No supported OpenMP loaded in module system")
+            print_stderr("WARN: No supported OpenMP loaded in module system")
         end
     end
     if omptype == "none" then
@@ -742,11 +760,11 @@ local function assignHosts(hosts, np, ppn)
     newhosts = {}
     current = 0
     if debug then
-        print(string.format("Assign %d processes with %d per node to %d hosts", np, ppn, #hosts))
-        print("Available hosts for scheduling:")
-        print("Host", "Slots", "MaxSlots", "Interface")
+        print_stdout(string.format("Assign %d processes with %d per node to %d hosts", np, ppn, #hosts))
+        print_stdout("Available hosts for scheduling:")
+        print_stdout("Host", "Slots", "MaxSlots", "Interface")
         for i, h in pairs(hosts) do
-            print (h["hostname"], h["slots"], h["maxslots"],"", h["interface"])
+            print_stdout (h["hostname"], h["slots"], h["maxslots"],"", h["interface"])
         end
     end
     local break_while = false
@@ -759,7 +777,7 @@ local function assignHosts(hosts, np, ppn)
                                             maxslots=host["maxslots"],
                                             interface=host["interface"]})
                     if debug then
-                        print(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], host["maxslots"]))
+                        print_stdout(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], host["maxslots"]))
                     end
                     current = host["maxslots"]
                     hosts[i] = nil
@@ -769,15 +787,15 @@ local function assignHosts(hosts, np, ppn)
                                             maxslots=host["slots"],
                                             interface=host["interface"]})
                     if debug then
-                        print(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], ppn))
+                        print_stdout(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], ppn))
                     end
                     current = ppn
                     hosts[i] = nil
                 end
             elseif host["slots"] then
-                if host["maxslots"] then
+                --[[if host["maxslots"] then
                     if host["maxslots"] < ppn then
-                        print(string.format("WARN: Oversubscription for host %s needed, but max-slots set to %d.",
+                        print_stderr(string.format("WARN: Oversubscription for host %s needed, but max-slots set to %d.",
                                                 host["hostname"], host["maxslots"]))
                         table.insert(newhosts, {hostname=host["hostname"],
                                                 slots=host["maxslots"],
@@ -787,7 +805,7 @@ local function assignHosts(hosts, np, ppn)
                         host["maxslots"] = 0
                         hosts[i] = nil
                     else
-                        print(string.format("WARN: Oversubscription for host %s.", host["hostname"]))
+                        print_stderr(string.format("WARN: Oversubscription for host %s.", host["hostname"]))
                         table.insert(newhosts, {hostname=host["hostname"],
                                             slots=ppn,
                                             maxslots=host["maxslots"],
@@ -796,20 +814,22 @@ local function assignHosts(hosts, np, ppn)
                         
                     end
                 else
-                    print(string.format("WARN: Oversubscription for host %s.", host["hostname"]))
+                    print_stderr(string.format("WARN: Oversubscription for host %s.", host["hostname"]))
                     table.insert(newhosts, {hostname=host["hostname"],
                                         slots=ppn,
                                         maxslots=host["slots"],
                                         interface=host["interface"]})
                     current = ppn
-                end
+                end]]
+                print_stderr(string.format("ERROR: Oversubscription required. Host %s has only %s slots but %d needed per host", host["hostname"], host["slots"], ppn))
+                os.exit(1)
             else
                 table.insert(newhosts, {hostname=host["hostname"],
                                         slots=ppn,
                                         maxslots=host["slots"],
                                         interface=host["interface"]})
                 if debug then
-                    print(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], ppn))
+                    print_stdout(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], ppn))
                 end
                 current = ppn
             end
@@ -844,7 +864,7 @@ local function assignHosts(hosts, np, ppn)
         end
     end
     if debug then
-        print("DEBUG: Scheduling on hosts:")
+        print_stdout("DEBUG: Scheduling on hosts:")
         for i, h in pairs(newhosts) do
             if h["maxslots"] ~= nil then
                 str = string.format("DEBUG: Host %s with %d slots (max. %d slots)",
@@ -855,7 +875,7 @@ local function assignHosts(hosts, np, ppn)
             if h["interface"] then
                 str = str.. string.format(" using interface %s", h["interface"])
             end
-            print(str)
+            print_stdout(str)
         end
     end
     return newhosts, ppn
@@ -891,7 +911,7 @@ local function calculateCpuExprs(nperdomain, cpuexprs)
         for i, idx in pairs(domainlist) do
             str = str .. affinity["domains"][idx]["tag"] .. " "
         end
-        print(str)
+        print_stdout(str)
     end
 
     for i, domidx in pairs(domainlist) do
@@ -912,14 +932,14 @@ local function calculateCpuExprs(nperdomain, cpuexprs)
         for i, expr in pairs(newexprs) do
             str = str .. expr .. " "
         end
-        print(str)
+        print_stdout(str)
     end
     return newexprs
 end
 
 local function createEventString(eventlist)
     if eventlist == nil or #eventlist == 0 then
-        print("ERROR: Empty event list. Failed to create event set string")
+        print_stderr("ERROR: Empty event list. Failed to create event set string")
         return ""
     end
     local str = ""
@@ -959,7 +979,7 @@ local function setPerfStrings(perflist, cpuexprs)
         local gdata = nil
         gdata = likwid.get_groupdata(perfStr)
         if gdata == nil then
-            print("Cannot get data for group "..perfStr..". Skipping...")
+            print_stderr("Cannot get data for group "..perfStr..". Skipping...")
         else
             table.insert(grouplist, gdata)
             if perfexprs[k] == nil then
@@ -1013,7 +1033,7 @@ local function setPerfStrings(perflist, cpuexprs)
 
             if debug then
                 for i, expr in pairs(perfexprs[k]) do
-                    print(string.format("DEBUG: Process %d measures with event set: %s", i-1, expr))
+                    print_stdout(string.format("DEBUG: Process %d measures with event set: %s", i-1, expr))
                 end
             end
         end
@@ -1072,7 +1092,7 @@ local function writeWrapperScript(scriptname, execStr, hosts, outputname)
         glsize_var = tostring(math.tointeger(np))
         losize_var = "$MPI_LOCALNRANKS"
     else
-        print("Invalid MPI vendor "..mpitype)
+        print_stderr("Invalid MPI vendor "..mpitype)
         return
     end
 
@@ -1086,7 +1106,7 @@ local function writeWrapperScript(scriptname, execStr, hosts, outputname)
 
     local f = io.open(scriptname, "w")
     if f == nil then
-        print("ERROR: Cannot open hostfile "..scriptname)
+        print_stderr("ERROR: Cannot open hostfile "..scriptname)
         os.exit(1)
     end
 
@@ -1128,7 +1148,11 @@ local function writeWrapperScript(scriptname, execStr, hosts, outputname)
     f:write("#!/bin/bash -l\n")
     f:write("GLOBALSIZE="..glsize_var.."\n")
     f:write("GLOBALRANK="..glrank_var.."\n")
-    f:write("unset OMP_NUM_THREADS\n")
+    if os.getenv("OMP_NUM_THREADS") == nil then
+        f:write("unset OMP_NUM_THREADS\n")
+    else
+        f:write(string.format("export OMP_NUM_THREADS=%s\n", os.getenv("OMP_NUM_THREADS")))
+    end
     if mpitype == "intelmpi" then
         f:write("export I_MPI_PIN=disable\n")
     end
@@ -1158,14 +1182,14 @@ local function writeWrapperScript(scriptname, execStr, hosts, outputname)
 
     f:write("if [ \"$LOCALRANK\" -eq 0 ]; then\n")
     if debug then
-        print("NODE_EXEC: "..commands[1])
+        print_stdout("NODE_EXEC: "..commands[1])
     end
     f:write("\t"..commands[1].."\n")
 
     for i=2,#commands do
         f:write("elif [ \"$LOCALRANK\" -eq "..tostring(i-1).." ]; then\n")
         if debug then
-            print("NODE_EXEC: "..commands[i])
+            print_stdout("NODE_EXEC: "..commands[i])
         end
         f:write("\t"..commands[i].."\n")
     end
@@ -1203,7 +1227,7 @@ local function parseOutputFile(filename)
     local results = {}
     local f = io.open(filename, "r")
     if f == nil then
-        print("ERROR: Cannot open output file "..filename)
+        print_stderr("ERROR: Cannot open output file "..filename)
         os.exit(1)
     end
     rank, host = filename:match("output_%d+_(%d+)_(%g+).csv")
@@ -1211,7 +1235,7 @@ local function parseOutputFile(filename)
     local t = f:read("*all")
     f:close()
     if t:len() == 0 then
-        print("Error Output file "..filename.." is empty")
+        print_stderr("Error Output file "..filename.." is empty")
         os.exit(1)
     end
     for i, line in pairs(likwid.stringsplit(t, "\n")) do
@@ -1291,7 +1315,7 @@ local function parseMarkerOutputFile(filename)
     local results = {}
     local f = io.open(filename, "r")
     if f == nil then
-        print("ERROR: Cannot open output file "..filename)
+        print_stderr("ERROR: Cannot open output file "..filename)
         os.exit(1)
     end
     rank, host = filename:match("output_%d+_(%d+)_(%g+).csv")
@@ -1323,7 +1347,7 @@ local function parseMarkerOutputFile(filename)
                 clock = tonumber(clock)*1.E09
             elseif parse_reg_info and line:match("TABLE,Region (%g+),Group (%d+) Raw,(%g+),") then
                 current_region, gidx, gname  = line:match("TABLE,Region (%g+),Group (%d+) Raw,(%g+),")
-                gidx = tonumber(gidx)+1
+                gidx = tonumber(gidx)
                 if results[current_region] == nil then
                     results[current_region] = {}
                 end
@@ -1547,9 +1571,9 @@ function printMpiOutput(group_list, all_results, regionname)
                 if #secondtab_combined > maxLineFields then maxLineFields = #secondtab_combined end
             end
             if region then
-                print("Region,"..tostring(region).. string.rep(",", maxLineFields  - 2))
+                print_stdout("Region,"..tostring(region).. string.rep(",", maxLineFields  - 2))
             end
-            print("Group,"..tostring(gidx) .. string.rep(",", maxLineFields  - 2))
+            print_stdout("Group,"..tostring(gidx) .. string.rep(",", maxLineFields  - 2))
             likwid.printcsv(firsttab, maxLineFields)
             if total_threads > 1 then likwid.printcsv(firsttab_combined, maxLineFields) end
             if gdata["Metrics"] then
@@ -1558,9 +1582,9 @@ function printMpiOutput(group_list, all_results, regionname)
             end
         else
             if region then
-                print("Region: "..tostring(region))
+                print_stdout("Region: "..tostring(region))
             end
-            print("Group: "..tostring(gidx))
+            print_stdout("Group: "..tostring(gidx))
             likwid.printtable(firsttab)
             if total_threads > 1 then likwid.printtable(firsttab_combined) end
             if gdata["Metrics"] then
@@ -1588,8 +1612,8 @@ for opt,arg in likwid.getopt(arg, {"n:","np:", "nperdomain:","pin:","hostfile:",
     if (type(arg) == "string") then
         local s,e = arg:find("-")
         if s == 1 then
-            print(string.format("ERROR: Argmument %s to option -%s starts with invalid character -.", arg, opt))
-            print("ERROR: Did you forget an argument to an option?")
+            print_stderr(string.format("ERROR: Argmument %s to option -%s starts with invalid character -.", arg, opt))
+            print_stderr("ERROR: Did you forget an argument to an option?")
             os.exit(1)
         end
     end
@@ -1611,14 +1635,14 @@ for opt,arg in likwid.getopt(arg, {"n:","np:", "nperdomain:","pin:","hostfile:",
     elseif opt == "n" or opt == "np" then
         np = tonumber(arg)
         if np == nil then
-            print("Argument for -n/-np must be a number")
+            print_stderr("Argument for -n/-np must be a number")
             os.exit(1)
         end
     elseif opt == "nperdomain" then
         nperdomain = arg
         local domain, count = nperdomain:match("([NSCM]%d*):(%d+)")
         if domain == nil then
-            print("Invalid option to -nperdomain")
+            print_stderr("Invalid option to -nperdomain")
             os.exit(1)
         end
     elseif opt == "hostfile" then
@@ -1634,33 +1658,41 @@ for opt,arg in likwid.getopt(arg, {"n:","np:", "nperdomain:","pin:","hostfile:",
     elseif opt == "s" or opt == "skip" then
         skipStr = "-s "..arg
     elseif opt == "?" then
-        print("Invalid commandline option -"..arg)
+        print_stderr("Invalid commandline option -"..arg)
         os.exit(1)
     elseif opt == "!" then
-        print("Option requires an argument")
+        print_stderr("Option requires an argument")
         os.exit(1)
     end
 end
 
 
 if np == 0 and nperdomain == nil and #cpuexprs == 0 then
-    print("ERROR: No option -n/-np, -nperdomain or -pin")
+    print_stderr("ERROR: No option -n/-np, -nperdomain or -pin")
     os.exit(1)
 end
 
 if use_marker and #perf == 0 then
-    print("ERROR: You selected the MarkerAPI feature but didn't set any events on the commandline")
+    print_stderr("ERROR: You selected the MarkerAPI feature but didn't set any events on the commandline")
     os.exit(1)
 end
 
+local test_mpiOpts = false
 for i=1,#arg do
-    table.insert(executable, arg[i])
+    if arg[i] == '--' then
+        test_mpiOpts = true
+    end
+    if not test_mpiOpts then
+        table.insert(executable, arg[i])
+    elseif arg[i] ~= '--' then
+        table.insert(mpiopts, arg[i])
+    end
 end
 if #executable == 0 then
-    print("ERROR: No executable given on commandline")
+    print_stderr("ERROR: No executable given on commandline")
     os.exit(1)
 elseif os.execute(string.format("ls %s 1>/dev/null 2>&1", executable[1])) == 0 then
-    print("ERROR: Cannot find executable given on commandline")
+    print_stderr("ERROR: Cannot find executable given on commandline")
     os.exit(1)
 else
     local f = io.popen(string.format("which %s 2>/dev/null", executable[1]))
@@ -1669,35 +1701,38 @@ else
         f:close()
     end
     if debug then
-        print("DEBUG: Executable given on commandline: "..table.concat(executable, " "))
+        print_stdout("DEBUG: Executable given on commandline: "..table.concat(executable, " "))
     end
 end
+if #mpiopts > 0 and debug then
+    print_stdout("DEBUG: MPI options given on commandline: "..table.concat(mpiopts, " "))
+end
 
 if mpitype == nil then
     mpitype = getMpiType()
     if debug then
-        print("DEBUG: Using MPI implementation "..mpitype)
+        print_stdout("DEBUG: Using MPI implementation "..mpitype)
     end
 end
 if mpitype ~= "intelmpi" and mpitype ~= "mvapich2" and mpitype ~= "openmpi" and mpitype ~= "slurm" then
-    print("ERROR: Cannot determine current MPI implementation. likwid-mpirun checks for openmpi, intelmpi and mvapich2 or if running in a SLURM environment")
+    print_stderr("ERROR: Cannot determine current MPI implementation. likwid-mpirun checks for openmpi, intelmpi and mvapich2 or if running in a SLURM environment")
     os.exit(1)
 end
 
 getMpiExec(mpitype)
 if (mpiexecutable == nil) then
-    print(string.format("Cannot find executable for determined MPI implementation %s", mpitype))
+    print_stderr(string.format("Cannot find executable for determined MPI implementation %s", mpitype))
     os.exit(1)
 end
 
 if omptype == nil then
     omptype = getOmpType()
     if debug and omptype ~= nil then
-        print("DEBUG: Using OpenMP implementation "..omptype)
+        print_stdout("DEBUG: Using OpenMP implementation "..omptype)
     end
 end
 if omptype == nil then
-    print("WARN: Cannot extract OpenMP vendor from executable or commandline, assuming no OpenMP")
+    print_stderr("WARN: Cannot extract OpenMP vendor from executable or commandline, assuming no OpenMP")
 end
 
 if not hostfile then
@@ -1720,42 +1755,13 @@ end
 
 local givenNrNodes = getNumberOfNodes(hosts)
 
-if skipStr == "" then
-    if mpitype == "intelmpi" then
-        if omptype == "intel" and givenNrNodes > 1 then
-            skipStr = '-s 0x3'
-        elseif omptype == "intel" and givenNrNodes == 1 then
-            skipStr = '-s 0x1'
-        elseif omptype == "gnu" and givenNrNodes > 1 then
-            skipStr = '-s 0x1'
-        elseif omptype == "gnu" and givenNrNodes == 1 then
-            skipStr = '-s 0x0'
-        end
-    elseif mpitype == "mvapich2" then
-        if omptype == "intel" and givenNrNodes > 1 then
-            skipStr = '-s 0x7'
-        end
-    elseif mpitype == "openmpi" then
-        if omptype == "intel" and givenNrNodes > 1 then
-            skipStr = '-s 0x7'
-        elseif omptype == "intel" and givenNrNodes == 1 then
-            skipStr = '-s 0x1'
-        elseif omptype == "gnu" and givenNrNodes > 1 then
-            skipStr = '-s 0x7'
-        elseif omptype == "gnu" and givenNrNodes == 1 then
-            skipStr = '-s 0x0'
-        end
-    end
-end
-if debug and skipStr ~= "" then
-    print(string.format("DEBUG: Using skip option %s to skip pinning of shepard threads", skipStr))
-end
+
 
 if #perf > 0 then
     local sum_maxslots = 0
     local topo = likwid.getCpuTopology()
     if debug then
-        print("DEBUG: Switch to perfctr mode, there are "..tostring(#perf).." eventsets given on the commandline")
+        print_stdout("DEBUG: Switch to perfctr mode, there are "..tostring(#perf).." eventsets given on the commandline")
     end
     for i, host in pairs(hosts) do
         if debug then
@@ -1763,7 +1769,7 @@ if #perf > 0 then
             if host["maxslots"] ~= nil then
                 str = str .. string.format(" and %d slots maximally", host["maxslots"])
             end
-            print(str)
+            print_stdout(str)
         end
         if host["maxslots"] ~= nil then
             sum_maxslots = sum_maxslots + host["maxslots"]
@@ -1775,33 +1781,45 @@ if #perf > 0 then
         end
     end
     if np > sum_maxslots then
-        print("ERROR: Processes requested exceeds maximally available slots of given hosts. Maximal processes: "..sum_maxslots)
+        print_stderr("ERROR: Processes requested exceeds maximally available slots of given hosts. Maximal processes: "..sum_maxslots)
         os.exit(1)
     end
 end
 
 if #cpuexprs > 0 then
     cpuexprs = calculatePinExpr(cpuexprs)
-    likwid.tableprint(cpuexprs)
-    print(#cpuexprs)
+    if debug then
+        str = "["
+        for i, expr in pairs(cpuexprs) do
+            str = str .. "["..expr.."], "
+        end
+        str = str:sub(1,str:len()-2) .. "]"
+        print_stdout("DEBUG: Evaluated CPU expressions: ".. str)
+    end
     ppn = #cpuexprs
     if np == 0 then
         if debug then
-            print(string.format("DEBUG: No -np given , setting according to pin expression and number of available hosts"))
+            print_stdout(string.format("DEBUG: No -np given , setting according to pin expression and number of available hosts"))
         end
         np = givenNrNodes * #cpuexprs
         ppn = #cpuexprs
     elseif np < #cpuexprs*givenNrNodes then
-        while np < #cpuexprs*givenNrNodes and #cpuexprs > 1 do
-            print("remove")
-            table.remove(cpuexprs)
+        while np < #cpuexprs*givenNrNodes and #hosts > 1 do
+            table.remove(hosts)
+            givenNrNodes = getNumberOfNodes(hosts)
         end
+        if #hosts == 1 and np < #cpuexprs then
+            while np < #cpuexprs do
+                table.remove(cpuexprs)
+            end
+        end
+        np = #cpuexprs*givenNrNodes
         ppn = #cpuexprs
     end
     newhosts = assignHosts(hosts, np, ppn)
     if np > #cpuexprs*#newhosts and #perf > 0 then
-        print("ERROR: Oversubsribing not allowed.")
-        print(string.format("ERROR: You want %d processes but the pinning expression has only expressions for %d processes. There are only %d hosts in the host list.", np, #cpuexprs*#newhosts, #newhosts))
+        print_stderr("ERROR: Oversubsribing not allowed.")
+        print_stderr(string.format("ERROR: You want %d processes but the pinning expression has only expressions for %d processes. There are only %d hosts in the host list.", np, #cpuexprs*#newhosts, #newhosts))
         os.exit(1)
     end
 elseif nperdomain ~= nil then
@@ -1812,18 +1830,18 @@ elseif nperdomain ~= nil then
     end
     if np < ppn then
         if debug then
-            print("WARN: Removing additional cpu expressions to get requested number of processes")
+            print_stderr("WARN: Removing additional cpu expressions to get requested number of processes")
         end
         for i=np+1,ppn do
             if debug then
-                print("WARN: Remove cpuexpr: "..cpuexprs[#cpuexprs])
+                print_stderr("WARN: Remove cpuexpr: "..cpuexprs[#cpuexprs])
             end
             table.remove(cpuexprs, #cpuexprs)
         end
         ppn = np
     elseif np > (givenNrNodes * ppn) and #perf > 0 then
-        print("ERROR: Oversubsribing nodes not allowed!")
-        print(string.format("ERROR: You want %d processes with %d on each of the %d hosts", np, ppn, givenNrNodes))
+        print_stderr("ERROR: Oversubsribing nodes not allowed!")
+        print_stderr(string.format("ERROR: You want %d processes with %d on each of the %d hosts", np, ppn, givenNrNodes))
         os.exit(1)
     end
     newhosts, ppn = assignHosts(hosts, np, ppn)
@@ -1849,10 +1867,10 @@ elseif ppn == 0 and np > 0 then
     end
     if (ppn * givenNrNodes) < np then
         if #perf == 0 then
-            print("ERROR: Processes cannot be equally distributed")
-            print(string.format("WARN: You want %d processes on %d hosts.", np, givenNrNodes))
+            print_stderr("ERROR: Processes cannot be equally distributed")
+            print_stderr(string.format("WARN: You want %d processes on %d hosts.", np, givenNrNodes))
             ppn = np/givenNrNodes
-            print(string.format("WARN: Sanitizing number of processes per node to %d", ppn))
+            print_stderr(string.format("WARN: Sanitizing number of processes per node to %d", ppn))
         else
             ppn = 0
             os.exit(1)
@@ -1890,7 +1908,7 @@ elseif ppn == 0 and np > 0 then
         end
     end
 else
-    print("ERROR: Commandline settings are not supported.")
+    print_stderr("ERROR: Commandline settings are not supported.")
     os.exit(1)
 end
 
@@ -1900,6 +1918,45 @@ if #perf > 0 then
 end
 
 local nrNodes = getNumberOfNodes(newhosts)
+if np > #cpuexprs*nrNodes then
+    np = #cpuexprs*nrNodes
+elseif np < #cpuexprs then
+    while np < #cpuexprs do
+        table.remove(cpuexprs)
+    end
+    ppn = #cpuexprs
+end
+
+if skipStr == "" then
+    if mpitype == "intelmpi" then
+        if omptype == "intel" and nrNodes > 1 then
+            skipStr = '-s 0x3'
+        elseif omptype == "intel" and nrNodes == 1 then
+            skipStr = '-s 0x3'
+        elseif omptype == "gnu" and nrNodes > 1 then
+            skipStr = '-s 0x1'
+        elseif omptype == "gnu" and nrNodes == 1 then
+            skipStr = '-s 0x0'
+        end
+    elseif mpitype == "mvapich2" then
+        if omptype == "intel" and nrNodes > 1 then
+            skipStr = '-s 0x7'
+        end
+    elseif mpitype == "openmpi" then
+        if omptype == "intel" and nrNodes > 1 then
+            skipStr = '-s 0x7'
+        elseif omptype == "intel" and nrNodes == 1 then
+            skipStr = '-s 0x1'
+        elseif omptype == "gnu" and nrNodes > 1 then
+            skipStr = '-s 0x7'
+        elseif omptype == "gnu" and nrNodes == 1 then
+            skipStr = '-s 0x0'
+        end
+    end
+end
+if debug and skipStr ~= "" then
+    print_stdout(string.format("DEBUG: Using skip option %s to skip pinning of shepard threads", skipStr))
+end
 
 local pid = likwid.getpid()
 local hostfilename = string.format(".hostfile_%s.txt", pid)
@@ -1909,14 +1966,14 @@ local outfilename = string.format(os.getenv("PWD").."/.output_%s_%%r_%%h.csv", p
 checkLikwid()
 
 if writeHostfile == nil or getEnvironment == nil or executeCommand == nil then
-    print("ERROR: Initialization for MPI specific functions failed")
+    print_stderr("ERROR: Initialization for MPI specific functions failed")
     os.exit(1)
 end
 
 writeHostfile(newhosts, hostfilename)
 writeWrapperScript(scriptfilename, table.concat(executable, " "), newhosts, outfilename)
 local env = getEnvironment()
-executeCommand(scriptfilename, hostfilename, env, nrNodes)
+local exitvalue = executeCommand(scriptfilename, hostfilename, env, nrNodes)
 
 os.remove(scriptfilename)
 os.remove(hostfilename)
@@ -1957,11 +2014,12 @@ else
         end
     end
     if likwid.tablelength(all_results) > 0 then
-        for reg, _ in pairs(tmpList[0]) do
+        for region, _ in pairs(tmpList[0]) do
             for rank,_ in pairs(all_results) do
-                all_results[rank]["results"] = tmpList[rank][reg]
+                all_results[rank]["results"] = tmpList[rank][region]
             end
-            printMpiOutput(grouplist, all_results, reg)
+            printMpiOutput(grouplist, all_results, region)
         end
     end
 end
+os.exit(exitvalue)
diff --git a/src/applications/likwid-perfctr.lua b/src/applications/likwid-perfctr.lua
index f49ecc7..c35f0fd 100644
--- a/src/applications/likwid-perfctr.lua
+++ b/src/applications/likwid-perfctr.lua
@@ -8,7 +8,7 @@
  *                    on x86 processors
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
@@ -34,45 +34,54 @@ package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
 
 local likwid = require("likwid")
 
+print_stdout = print
+print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end io.stderr:flush() end
+
 local function version()
-    print(string.format("likwid-perfctr --  Version %d.%d",likwid.version,likwid.release))
+    print_stdout(string.format("likwid-perfctr --  Version %d.%d",likwid.version,likwid.release))
 end
 
 local function examples()
-    print("Examples:")
-    print("Run command on CPU 2 and measure performance group TEST:")
-    print("likwid-perfctr -C 2 -g TEST ./a.out")
+    io.stdout:write("Examples:\n")
+    io.stdout:write("List all performance groups:\n")
+    io.stdout:write("likwid-perfctr -a\n")
+    io.stdout:write("List all events and counters:\n")
+    io.stdout:write("likwid-perfctr -e\n")
+    io.stdout:write("List all events and suitable counters for events with 'L2' in them:\n")
+    io.stdout:write("likwid-perfctr -E L2\n")
+    io.stdout:write("Run command on CPU 2 and measure performance group TEST:\n")
+    io.stdout:write("likwid-perfctr -C 2 -g TEST ./a.out\n")
 end
 
 local function usage()
     version()
-    print("A tool to read out performance counter registers on x86 processors\n")
-    print("Options:")
-    print("-h, --help\t\t Help message")
-    print("-v, --version\t\t Version information")
-    print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
-    print("-c <list>\t\t Processor ids to measure (required), e.g. 1,2-4,8")
-    print("-C <list>\t\t Processor ids to pin threads and measure, e.g. 1,2-4,8")
-    print("\t\t\t For information about the <list> syntax, see likwid-pin")
-    print("-g, --group <string>\t Performance group or custom event set string")
-    print("-H\t\t\t Get group help (together with -g switch)")
-    print("-s, --skip <hex>\t Bitmask with threads to skip")
-    print("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon")
-    print("-a\t\t\t List available performance groups")
-    print("-e\t\t\t List available events and counter registers")
-    print("-E <string>\t\t List available events and corresponding counters that match <string>")
-    print("-i, --info\t\t Print CPU info")
-    print("-T <time>\t\t Switch eventsets with given frequency")
-    print("-f, --force\t\t Force overwrite of registers if they are in use")
-    print("Modes:")
-    print("-S <time>\t\t Stethoscope mode with duration in s, ms or us, e.g 20ms")
-    print("-t <time>\t\t Timeline mode with frequency in s, ms or us, e.g. 300ms")
-    print("-m, --marker\t\t Use Marker API inside code")
-    print("Output options:")
-    print("-o, --output <file>\t Store output to file. (Optional: Apply text filter according to filename suffix)")
-    print("-O\t\t\t Output easily parseable CSV instead of fancy tables")
-    print("--stats\t\t\t Always print statistics table")
-    print("\n")
+    io.stdout:write("A tool to read out performance counter registers on x86 processors\n\n")
+    io.stdout:write("Options:\n")
+    io.stdout:write("-h, --help\t\t Help message\n")
+    io.stdout:write("-v, --version\t\t Version information\n")
+    io.stdout:write("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)\n")
+    io.stdout:write("-c <list>\t\t Processor ids to measure (required), e.g. 1,2-4,8\n")
+    io.stdout:write("-C <list>\t\t Processor ids to pin threads and measure, e.g. 1,2-4,8\n")
+    io.stdout:write("\t\t\t For information about the <list> syntax, see likwid-pin\n")
+    io.stdout:write("-g, --group <string>\t Performance group or custom event set string\n")
+    io.stdout:write("-H\t\t\t Get group help (together with -g switch)\n")
+    io.stdout:write("-s, --skip <hex>\t Bitmask with threads to skip\n")
+    io.stdout:write("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon\n")
+    io.stdout:write("-a\t\t\t List available performance groups\n")
+    io.stdout:write("-e\t\t\t List available events and counter registers\n")
+    io.stdout:write("-E <string>\t\t List available events and corresponding counters that match <string>\n")
+    io.stdout:write("-i, --info\t\t Print CPU info\n")
+    io.stdout:write("-T <time>\t\t Switch eventsets with given frequency\n")
+    io.stdout:write("-f, --force\t\t Force overwrite of registers if they are in use\n")
+    io.stdout:write("Modes:\n")
+    io.stdout:write("-S <time>\t\t Stethoscope mode with duration in s, ms or us, e.g 20ms\n")
+    io.stdout:write("-t <time>\t\t Timeline mode with frequency in s, ms or us, e.g. 300ms\n")
+    io.stdout:write("-m, --marker\t\t Use Marker API inside code\n")
+    io.stdout:write("Output options:\n")
+    io.stdout:write("-o, --output <file>\t Store output to file. (Optional: Apply text filter according to filename suffix)\n")
+    io.stdout:write("-O\t\t\t Output easily parseable CSV instead of fancy tables\n")
+    io.stdout:write("--stats\t\t\t Always print statistics table\n")
+    io.stdout:write("\n")
     examples()
 end
 
@@ -113,7 +122,7 @@ use_timeline = false
 daemon_run = 0
 use_wrapper = false
 duration = 2.E06
-switch_interval = 5
+overflow_interval = 2.E06
 output = ""
 use_csv = false
 print_stats = false
@@ -122,7 +131,6 @@ outfile = nil
 forceOverwrite = 0
 gotC = false
 markerFile = string.format("/tmp/likwid_%d.txt",likwid.getpid())
-print_stdout = print
 cpuClock = 1
 likwid.catchSignal()
 
@@ -135,25 +143,55 @@ for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", "
     if (type(arg) == "string") then
         local s,e = arg:find("-");
         if s == 1 then
-            print_stdout(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
-            print_stdout("Did you forget an argument to an option?")
+            print_stderr(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
+            print_stderr("Did you forget an argument to an option?")
             os.exit(1)
         end
     end
     if opt == "h" or opt == "help" then
         usage()
+        if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+            os.remove(outfile..".tmp")
+        end
         os.exit(0)
     elseif opt == "v" or opt == "version" then
         version()
+        if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+            os.remove(outfile..".tmp")
+        end
         os.exit(0)
     elseif opt == "V" or opt == "verbose" then
-        verbose = tonumber(arg)
-        likwid.setVerbosity(verbose)
+        if arg ~= nil and tonumber(arg) ~= nil then
+            verbose = tonumber(arg)
+            likwid.setVerbosity(verbose)
+        else
+            print_stderr("Option requires an argument")
+            if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+                os.remove(outfile..".tmp")
+            end
+            os.exit(1)
+        end
     elseif (opt == "c") then
-        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+        if arg ~= nil then
+            num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+        else
+            print_stderr("Option requires an argument")
+            if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+                os.remove(outfile..".tmp")
+            end
+            os.exit(1)
+        end
         gotC = true
     elseif (opt == "C") then
-        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+        if arg ~= nil then
+            num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+        else
+            print_stderr("Option requires an argument")
+            if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+                os.remove(outfile..".tmp")
+            end
+            os.exit(1)
+        end
         pin_cpus = true
         gotC = true
     elseif (opt == "a") then
@@ -161,7 +199,15 @@ for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", "
     elseif (opt == "e") then
         print_events = true
     elseif (opt == "E") then
-        print_event = arg
+        if arg ~= nil then
+            print_event = arg
+        else
+            print_stderr("Option requires an argument")
+            if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+                os.remove(outfile..".tmp")
+            end
+            os.exit(1)
+        end
     elseif opt == "f" or opt == "force" then
         forceOverwrite = 1
     elseif opt == "g" or opt == "group" then
@@ -173,10 +219,10 @@ for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", "
             skip_mask = arg
         else
             if arg:match("[0-9A-F]") then
-                print("Given skip mask looks like hex, sanitizing arg to 0x"..arg)
+                print_stderr("Given skip mask looks like hex, sanitizing arg to 0x"..arg)
                 skip_mask = "0x"..arg
             else
-                print("Skip mask must be given in hex")
+                print_stderr("Skip mask must be given in hex")
             end
         end
     elseif (opt == "M") then
@@ -189,6 +235,9 @@ for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", "
         end
         if (access_mode < 0 and access_mode > 1) then
             print_stdout("Access mode must be 0 for direct access and 1 for access daemon")
+            if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+                os.remove(outfile..".tmp")
+            end
             os.exit(1)
         end
     elseif opt == "i" or opt == "info" then
@@ -199,12 +248,36 @@ for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", "
         use_wrapper = true
     elseif (opt == "S") then
         use_stethoscope = true
-        duration = likwid.parse_time(arg)
+        if arg ~= nil and arg:match("%d+%a?s") then
+            duration = likwid.parse_time(arg)
+        else
+            print_stderr("Option requires an argument")
+            if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+                os.remove(outfile..".tmp")
+            end
+            os.exit(1)
+        end
     elseif (opt == "t") then
         use_timeline = true
-        duration = likwid.parse_time(arg)
+        if arg ~= nil and arg:match("%d+%a?s") then
+            duration = likwid.parse_time(arg)
+        else
+            print_stderr("Option requires an argument")
+            if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+                os.remove(outfile..".tmp")
+            end
+            os.exit(1)
+        end
     elseif (opt == "T") then
-        duration = likwid.parse_time(arg)
+        if arg ~= nil and arg:match("%d+%a?s") then
+            duration = likwid.parse_time(arg)
+        else
+            print_stderr("Option requires an argument")
+            if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+                os.remove(outfile..".tmp")
+            end
+            os.exit(1)
+        end
     elseif opt == "o" or opt == "output" then
         local suffix = ""
         if string.match(arg, "%.") then
@@ -224,10 +297,16 @@ for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", "
     elseif (opt == "stats") then
         print_stats = true
     elseif opt == "?" then
-        print("Invalid commandline option -"..arg)
+        print_stderr("Invalid commandline option -"..arg)
+        if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+            os.remove(outfile..".tmp")
+        end
         os.exit(1)
     elseif opt == "!" then
-        print("Option requires an argument")
+        print_stderr("Option requires an argument")
+        if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+            os.remove(outfile..".tmp")
+        end
         os.exit(1)
     end
 end
@@ -238,12 +317,18 @@ cputopo = likwid.getCpuTopology()
 
 if not likwid.msr_available(access_flags) then
     if access_mode == 1 then
-        print_stdout("MSR device files not available")
-        print_stdout("Please load msr kernel module before retrying")
+        print_stderr("MSR device files not available")
+        print_stderr("Please load msr kernel module before retrying")
+        if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+            os.remove(outfile..".tmp")
+        end
         os.exit(1)
     else
-        print_stdout("MSR device files not readable and writeable")
-        print_stdout("Be sure that you have enough permissions to access the MSR files directly")
+        print_stderr("MSR device files not readable and writeable")
+        print_stderr("Be sure that you have enough permissions to access the MSR files directly")
+        if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+            os.remove(outfile..".tmp")
+        end
         os.exit(1)
     end
 end
@@ -255,8 +340,11 @@ if num_cpus == 0 and
    not print_groups and
    not print_group_help and
    not print_info then
-    print_stdout("Option -c <list> or -C <list> must be given on commandline")
+    print_stderr("Option -c <list> or -C <list> must be given on commandline")
     usage()
+    if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+        os.remove(outfile..".tmp")
+    end
     os.exit(1)
 elseif num_cpus == 0 and
        gotC and
@@ -265,7 +353,10 @@ elseif num_cpus == 0 and
        not print_groups and
        not print_group_help and
        not print_info then
-    print_stdout("CPUs given on commandline are not valid in current environment, maybe it's limited by a cpuset.")
+    print_stderr("CPUs given on commandline are not valid in current environment, maybe it's limited by a cpuset.")
+    if outfile ~= nil and likwid.access(outfile..".tmp", "e") == 0 then
+        os.remove(outfile..".tmp")
+    end
     os.exit(1)
 end
 
@@ -274,7 +365,10 @@ if num_cpus > 0 then
     for i,cpu1 in pairs(cpulist) do
         for j, cpu2 in pairs(cpulist) do
             if i ~= j and cpu1 == cpu2 then
-                print_stdout("List of CPUs is not unique, got two times CPU " .. tostring(cpu1))
+                print_stderr("List of CPUs is not unique, got two times CPU " .. tostring(cpu1))
+                if outfile and likwid.access(outfile..".tmp", "e") == 0 then
+                    os.remove(outfile..".tmp")
+                end
                 os.exit(1)
             end
         end
@@ -396,7 +490,7 @@ if print_group_help == true then
 end
 
 if #event_string_list == 0 and not print_info then
-    print_stdout("Option(s) -g <string> must be given on commandline")
+    print_stderr("Option(s) -g <string> must be given on commandline")
     usage()
     likwid.putTopology()
     likwid.putConfiguration()
@@ -439,12 +533,29 @@ if print_info or verbose > 0 then
     end
 end
 
+if use_marker == true and use_timeline == true then
+    print_stderr("Cannot run Marker API and Timeline mode simultaneously")
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(1)
+elseif use_marker == true and use_stethoscope == true then
+    print_stderr("Cannot run Marker API and Stethoscope mode simultaneously")
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(1)
+elseif use_timeline == true and use_stethoscope == true then
+    print_stderr("Cannot run Timeline and Stethoscope mode simultaneously")
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(1)
+end
+
 if use_stethoscope == false and use_timeline == false and use_marker == false then
     use_wrapper = true
 end
 
 if use_wrapper and likwid.tablelength(arg)-2 == 0 and print_info == false then
-    print_stdout("No Executable can be found on commandline")
+    print_stderr("No Executable can be found on commandline")
     usage()
     likwid.putTopology()
     likwid.putConfiguration()
@@ -453,16 +564,16 @@ end
 
 if use_marker then
     if likwid.access(markerFile, "rw") ~= -1 then
-        print_stdout(string.format("ERROR: MarkerAPI file %s not accessible. Maybe a remaining file of another user.", markerFile))
-        print_stdout("Please purge all MarkerAPI files from /tmp.")
+        print_stderr(string.format("ERROR: MarkerAPI file %s not accessible. Maybe a remaining file of another user.", markerFile))
+        print_stderr("Please purge all MarkerAPI files from /tmp.")
         os.exit(1)
     end
     if not pin_cpus then
-        print_stdout("Warning: The Marker API requires the application to run on the selected CPUs.")
-        print_stdout("Warning: likwid-perfctr pins the application only when using the -C command line option.")
-        print_stdout("Warning: LIKWID assumes that the application does it before the first instrumented code region is started.")
-        print_stdout("Warning: You can use the string in the environment variable LIKWID_THREADS to pin you application to")
-        print_stdout("Warning: to the CPUs specified after the -c command line option.")
+        print_stderr("Warning: The Marker API requires the application to run on the selected CPUs.")
+        print_stderr("Warning: likwid-perfctr pins the application only when using the -C command line option.")
+        print_stderr("Warning: LIKWID assumes that the application does it before the first instrumented code region is started.")
+        print_stderr("Warning: You can use the string in the environment variable LIKWID_THREADS to pin you application to")
+        print_stderr("Warning: to the CPUs specified after the -c command line option.")
     end
 end
 
@@ -475,7 +586,7 @@ if pin_cpus then
     if omp_threads == nil then
         likwid.setenv("OMP_NUM_THREADS",tostring(math.tointeger(num_cpus)))
     elseif num_cpus > tonumber(omp_threads) then
-        print_stdout(string.format("Environment variable OMP_NUM_THREADS already set to %s but %d cpus required", omp_threads,num_cpus))
+        print_stderr(string.format("Environment variable OMP_NUM_THREADS already set to %s but %d cpus required", omp_threads,num_cpus))
     end
     if os.getenv("CILK_NWORKERS") == nil then
         likwid.setenv("CILK_NWORKERS", tostring(math.tointeger(num_cpus)))
@@ -538,7 +649,9 @@ if likwid.init(num_cpus, cpulist) < 0 then
     os.exit(1)
 end
 
-likwid.setenv("LIKWID_FORCE", tostring(forceOverwrite))
+if os.getenv("LIKWID_FORCE") == nil or (forceOverwrite == 1 and os.getenv("LIKWID_FORCE") ~= tostring(forceOverwrite)) then
+    likwid.setenv("LIKWID_FORCE", tostring(forceOverwrite))
+end
 for i, event_string in pairs(event_string_list) do
     if event_string:len() > 0 then
         local gid = likwid.addEventSet(event_string)
@@ -552,7 +665,7 @@ for i, event_string in pairs(event_string_list) do
     end
 end
 if #group_ids == 0 then
-    print("ERROR: No valid eventset given on commandline. Exiting...")
+    print_stderr("ERROR: No valid eventset given on commandline. Exiting...")
     likwid.putTopology()
     likwid.putConfiguration()
     likwid.finalize()
@@ -593,7 +706,7 @@ if use_timeline == true then
     for i, cpu in pairs(cpulist) do
         cores_string = cores_string .. tostring(cpu) .. "|"
     end
-    io.stderr:write("# "..cores_string:sub(1,cores_string:len()-1).."\n")
+    print_stderr("# "..cores_string:sub(1,cores_string:len()-1).."\n")
     for gid, group in pairs(group_list) do
         local strlist = {}
         if group["Metrics"] == nil then
@@ -605,7 +718,7 @@ if use_timeline == true then
                 table.insert(strlist, e["description"])
             end
         end
-        io.stderr:write("# "..table.concat(strlist, "|").."\n")
+        print_stderr("# "..table.concat(strlist, "|").."\n")
     end
 end
 
@@ -613,6 +726,7 @@ end
 
 io.stdout:flush()
 local groupTime = {}
+local exitvalue = 0
 if use_wrapper or use_timeline then
     local start = likwid.startClock()
     local stop = 0
@@ -627,7 +741,7 @@ if use_wrapper or use_timeline then
 
     local ret = likwid.startCounters()
     if ret < 0 then
-        print_stdout(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
+        print_stderr(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
         os.exit(1)
     end
 
@@ -639,7 +753,11 @@ if use_wrapper or use_timeline then
     end
 
     if not pid then
-        print_stdout("Failed to execute command: ".. execString)
+        print_stderr("Failed to execute command: ".. execString)
+        likwid.putTopology()
+        likwid.putNumaInfo()
+        likwid.putConfiguration()
+        os.exit(1)
     end
     start = likwid.startClock()
     groupTime[activeGroup] = 0
@@ -649,13 +767,14 @@ if use_wrapper or use_timeline then
             break
         end
         local remain = likwid.sleep(duration)
-        if remain > 0 or not likwid.checkProgram(pid) then
+        exitvalue = likwid.checkProgram(pid)
+        if remain > 0 or exitvalue >= 0 then
             io.stdout:flush()
             break
         end
         if use_timeline == true then
             stop = likwid.stopClock()
-            likwid.stopCounters()
+            likwid.readCounters()
             
             local time = likwid.getClock(start, stop)
             if likwid.getNumberOfMetrics(activeGroup) == 0 then
@@ -671,7 +790,6 @@ if use_wrapper or use_timeline then
             end
             io.stderr:write(str.."\n")
             groupTime[activeGroup] = time
-            likwid.startCounters()
         else
             likwid.readCounters()
         end
@@ -689,29 +807,30 @@ if use_wrapper or use_timeline then
 elseif use_stethoscope then
     local ret = likwid.startCounters()
     if ret < 0 then
-        print_stdout(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
+        print_stderr(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
         os.exit(1)
     end
     likwid.sleep(duration)
 elseif use_marker then
     local ret = likwid.startCounters()
     if ret < 0 then
-        print_stdout(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
+        print_stderr(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
         os.exit(1)
     end
     local ret = os.execute(execString)
     if ret == nil then
-        print_stdout("Failed to execute command: ".. execString)
+        print_stderr("Failed to execute command: ".. execString)
+        exitvalue = 1
     end
 end
 
 local ret = likwid.stopCounters()
 if ret < 0 then
-    print_stdout(string.format("Error stopping counters for thread %d.",ret * (-1)))
+    print_stderr(string.format("Error stopping counters for thread %d.",ret * (-1)))
     likwid.finalize()
     likwid.putTopology()
     likwid.putConfiguration()
-    os.exit(1)
+    os.exit(exitvalue)
 end
 io.stdout:flush()
 if outfile == nil then
@@ -722,7 +841,7 @@ end
 if use_marker == true then
     results, metrics = likwid.getMarkerResults(markerFile, cpulist)
     if #results == 0 then
-        print_stdout("No regions could be found in Marker API result file")
+        print_stderr("No regions could be found in Marker API result file")
     else
         for r=1, #results do
             likwid.printOutput(results[r], metrics[r], cpulist, r, print_stats)
@@ -745,7 +864,7 @@ if outfile then
     if suffix == "" then
         os.rename(tmpfile, outfile)
     elseif suffix ~= "txt" and suffix ~= "csv" and likwid.access(command, "x") then
-        print_stdout("Cannot find filter script, save output in CSV format to file "..outfile)
+        print_stderr("Cannot find filter script, save output in CSV format to file "..outfile)
         os.rename(tmpfile, outfile)
     else
         if suffix ~= "txt" and suffix ~= "csv" then
@@ -754,10 +873,10 @@ if outfile then
             if f ~= nil then
                 local o = f:read("*a")
                 if o:len() > 0 then
-                    print_stdout(string.format("Failed to executed filter script %s.",command))
+                    print_stderr(string.format("Failed to executed filter script %s.",command))
                 end
             else
-                print_stdout("Failed to call filter script, save output in CSV format to file "..outfile)
+                print_stderr("Failed to call filter script, save output in CSV format to file "..outfile)
                 os.rename(tmpfile, outfile)
                 os.remove(tmpfile)
             end
@@ -772,4 +891,4 @@ likwid.finalize()
 likwid.putTopology()
 likwid.putNumaInfo()
 likwid.putConfiguration()
-os.exit(0)
+os.exit(exitvalue)
diff --git a/src/applications/likwid-perfscope.lua b/src/applications/likwid-perfscope.lua
index c1165a7..faad459 100644
--- a/src/applications/likwid-perfscope.lua
+++ b/src/applications/likwid-perfscope.lua
@@ -8,7 +8,7 @@
  *                    realtime plots using feedGnuplot
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
@@ -34,6 +34,9 @@ package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
 
 local likwid = require("likwid")
 
+print_stdout = print
+print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end end
+
 PERFCTR="<INSTALLED_BINPREFIX>/likwid-perfctr"
 FEEDGNUPLOT="<INSTALLED_BINPREFIX>/feedGnuplot"
 
@@ -118,33 +121,33 @@ local predefined_plots = {
 }
 
 local function version()
-    print(string.format("likwid-perfscope --  Version %d.%d",likwid.version,likwid.release))
+    print_stdout(string.format("likwid-perfscope --  Version %d.%d",likwid.version,likwid.release))
 end
 
 local function examples()
-    print("Examples:")
-    print("Run command on CPU 2 and measure performance group TEST:")
-    print("likwid-perfscope -C 2 -g TEST -f 1s ./a.out")
+    print_stdout("Examples:")
+    print_stdout("Run command on CPU 2 and measure performance group TEST:")
+    print_stdout("likwid-perfscope -C 2 -g TEST -f 1s ./a.out")
 end
 
 local function usage()
     version()
-    print("A tool to generate pictures on-the-fly from likwid-perfctr measurements\n")
-    print("Options:")
-    print("-h, --help\t\t Help message")
-    print("-v, --version\t\t Version information")
-    print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
-    print("-a\t\t\t Print all preconfigured plot configurations for the current system.")
-    print("-c <list>\t\t Processor ids to measure, e.g. 1,2-4,8")
-    print("-C <list>\t\t Processor ids to pin threads and measure, e.g. 1,2-4,8")
-    print("-g, --group <string>\t Preconfigured plot group or custom event set string with plot config. See man page for information.")
-    print("-t, --time <time>\t Frequency in s, ms or us, e.g. 300ms, for the timeline mode of likwid-perfctr")
-    print("-f, --force\t\t Overwrite counter configuration although already in use")
-    print("-d, --dump\t\t Print output as it is send to feedGnuplot.")
-    print("-p, --plotdump\t\t Use dump functionality of feedGnuplot. Plots out plot configurations plus data to directly submit to gnuplot")
-    print("--host <host>\t\t Run likwid-perfctr on the selected host using SSH. Evaluation and plotting is done locally.")
-    print("\t\t\t This can be used for machines that have no gnuplot installed. All paths must be similar to the local machine.")
-    print("\n")
+    print_stdout("A tool to generate pictures on-the-fly from likwid-perfctr measurements\n")
+    print_stdout("Options:")
+    print_stdout("-h, --help\t\t Help message")
+    print_stdout("-v, --version\t\t Version information")
+    print_stdout("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+    print_stdout("-a\t\t\t Print all preconfigured plot configurations for the current system.")
+    print_stdout("-c <list>\t\t Processor ids to measure, e.g. 1,2-4,8")
+    print_stdout("-C <list>\t\t Processor ids to pin threads and measure, e.g. 1,2-4,8")
+    print_stdout("-g, --group <string>\t Preconfigured plot group or custom event set string with plot config. See man page for information.")
+    print_stdout("-t, --time <time>\t Frequency in s, ms or us, e.g. 300ms, for the timeline mode of likwid-perfctr")
+    print_stdout("-f, --force\t\t Overwrite counter configuration although already in use")
+    print_stdout("-d, --dump\t\t Print output as it is send to feedGnuplot.")
+    print_stdout("-p, --plotdump\t\t Use dump functionality of feedGnuplot. Plots out plot configurations plus data to directly submit to gnuplot")
+    print_stdout("--host <host>\t\t Run likwid-perfctr on the selected host using SSH. Evaluation and plotting is done locally.")
+    print_stdout("\t\t\t This can be used for machines that have no gnuplot installed. All paths must be similar to the local machine.")
+    print_stdout("\n")
     examples()
 end
 
@@ -210,7 +213,10 @@ for opt,arg in likwid.getopt(arg, {"h","v","g:","C:","c:","t:","r:","a","d","p",
     elseif opt == "f" or opt == "force" then
         force = true
     elseif opt == "?" then
-        print("Invalid commandline option -"..arg)
+        print_stderr("Invalid commandline option -"..arg)
+        os.exit(1)
+    elseif opt == "!" then
+        print_stderr("Option requires an argument")
         os.exit(1)
     end
 end
@@ -220,21 +226,21 @@ if print_configs then
     for name, config in pairs(predefined_plots) do
         for i,g in pairs(all_groups) do
             if g == config["perfgroup"] then
-                print("Group "..name)
-                print("\tPerfctr group: "..config["perfgroup"])
-                print("\tMatch for metric: "..config["ymetricmatch"])
-                print("\tTitle of plot: "..config["title"])
-                print("\tTitle of x-axis: "..config["xtitle"])
-                print("\tTitle of y-axis: "..config["ytitle"])
+                print_stdout("Group "..name)
+                print_stdout("\tPerfctr group: "..config["perfgroup"])
+                print_stdout("\tMatch for metric: "..config["ymetricmatch"])
+                print_stdout("\tTitle of plot: "..config["title"])
+                print_stdout("\tTitle of x-axis: "..config["xtitle"])
+                print_stdout("\tTitle of y-axis: "..config["ytitle"])
                 if config["y2metricmatch"] then
-                    print("\tMatch for second metric: "..config["y2metricmatch"])
+                    print_stdout("\tMatch for second metric: "..config["y2metricmatch"])
                 end
                 if config["y2title"] then
-                    print("\tTitle of y2-axis: "..config["y2title"])
+                    print_stdout("\tTitle of y2-axis: "..config["y2title"])
                 elseif config["y2metricmatch"] then
-                    print("\tTitle of y2-axis: "..config["ytitle"])
+                    print_stdout("\tTitle of y2-axis: "..config["ytitle"])
                 end
-                print("")
+                print_stdout("")
                 break
             end
         end
@@ -243,17 +249,17 @@ if print_configs then
 end
 
 if not test_gnuplot() then
-    print("GnuPlot not available")
+    print_stderr("GnuPlot not available")
     os.exit(1)
 end
 
 if num_cpus == 0 then
-    print("ERROR: CPU string must be given")
+    print_stderr("ERROR: CPU string must be given")
     os.exit(1)
 end
 
 if #arg == 0 then
-    print("ERROR: Executable must be given on commandline")
+    print_stderr("ERROR: Executable must be given on commandline")
     os.exit(1)
 end
 
@@ -291,7 +297,7 @@ for i, event_def in pairs(eventStrings) do
     local groupdata = nil
     groupdata = likwid.get_groupdata(event_string)
     if groupdata == nil then
-        print("Cannot read event string, it's neither a performance group nor a proper event string <event>:<counter>:<options>,...")
+        print_stderr("Cannot read event string, it's neither a performance group nor a proper event string <event>:<counter>:<options>,...")
         usage()
         os.exit(1)
     end
@@ -491,7 +497,7 @@ for i,g in pairs(group_list) do
     g["output"]:write(str.."\n")
     g["output"]:flush()
     if dump then
-        print(tostring(i).." ".. str)
+        print_stdout(tostring(i).." ".. str)
     end
 end
 
@@ -539,7 +545,7 @@ while true do
         group_list[group]["output"]:write(str.."\n")
         group_list[group]["output"]:flush()
         if dump then
-            print(tostring(group).." ".. str)
+            print_stdout(tostring(group).." ".. str)
         end
         oldtime = time
     end
diff --git a/src/applications/likwid-pin.lua b/src/applications/likwid-pin.lua
index de57652..93b8ac2 100644
--- a/src/applications/likwid-pin.lua
+++ b/src/applications/likwid-pin.lua
@@ -7,7 +7,7 @@
  *      Description:  An application to pin a program including threads
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
@@ -32,57 +32,60 @@ package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
 
 local likwid = require("likwid")
 
+print_stdout = print
+print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end end
+
 local function version()
-    print(string.format("likwid-pin.lua --  Version %d.%d",likwid.version,likwid.release))
+    print_stdout(string.format("likwid-pin.lua --  Version %d.%d",likwid.version,likwid.release))
 end
 
 local function examples()
-    print("Examples:")
-    print("There are three possibilities to provide a thread to processor list:")
-    print("1. Thread list with physical thread IDs")
-    print("Example: likwid-pin.lua -c 0,4-6 ./myApp")
-    print("Pins the application to cores 0,4,5 and 6")
-    print("2. Thread list with logical thread numberings in physical cores first sorted list.")
-    print("Example usage thread list: likwid-pin.lua -c N:0,4-6 ./myApp")
-    print("You can pin with the following numberings:")
-    print("\t2. Logical numbering inside node.\n\t   e.g. -c N:0,1,2,3 for the first 4 physical cores of the node")
-    print("\t3. Logical numbering inside socket.\n\t   e.g. -c S0:0-1 for the first 2 physical cores of the socket")
-    print("\t4. Logical numbering inside last level cache group.\n\t   e.g. -c C0:0-3  for the first 4 physical cores in the first LLC")
-    print("\t5. Logical numbering inside NUMA domain.\n\t   e.g. -c M0:0-3 for the first 4 physical cores in the first NUMA domain")
-    print("\tYou can also mix domains separated by  @,\n\te.g. -c S0:0-3 at S1:0-3 for the 4 first physical cores on both sockets.")
-    print("3. Expressions based thread list generation with compact processor numbering.")
-    print("Example usage expression: likwid-pin.lua -c E:N:8 ./myApp")
-    print("This will generate a compact list of thread to processor mapping for the node domain")
-    print("with eight threads.")
-    print("The following syntax variants are available:")
-    print("\t1. -c E:<thread domain>:<number of threads>")
-    print("\t2. -c E:<thread domain>:<number of threads>:<chunk size>:<stride>")
-    print("\tFor two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4")
-    print("4. Scatter policy among thread domain type.")
-    print("Example usage scatter: likwid-pin.lua -c M:scatter ./myApp")
-    print("This will generate a thread to processor mapping scattered among all memory domains")
-    print("with physical cores first.")
-    print("")
-    print("likwid-pin sets OMP_NUM_THREADS with as many threads as specified")
-    print("in your pin expression if OMP_NUM_THREADS is not present in your environment.")
+    print_stdout("Examples:")
+    print_stdout("There are three possibilities to provide a thread to processor list:")
+    print_stdout("1. Thread list with physical thread IDs")
+    print_stdout("Example: likwid-pin.lua -c 0,4-6 ./myApp")
+    print_stdout("Pins the application to cores 0,4,5 and 6")
+    print_stdout("2. Thread list with logical thread numberings in physical cores first sorted list.")
+    print_stdout("Example usage thread list: likwid-pin.lua -c N:0,4-6 ./myApp")
+    print_stdout("You can pin with the following numberings:")
+    print_stdout("\t2. Logical numbering inside node.\n\t   e.g. -c N:0,1,2,3 for the first 4 physical cores of the node")
+    print_stdout("\t3. Logical numbering inside socket.\n\t   e.g. -c S0:0-1 for the first 2 physical cores of the socket")
+    print_stdout("\t4. Logical numbering inside last level cache group.\n\t   e.g. -c C0:0-3  for the first 4 physical cores in the first LLC")
+    print_stdout("\t5. Logical numbering inside NUMA domain.\n\t   e.g. -c M0:0-3 for the first 4 physical cores in the first NUMA domain")
+    print_stdout("\tYou can also mix domains separated by  @,\n\te.g. -c S0:0-3 at S1:0-3 for the 4 first physical cores on both sockets.")
+    print_stdout("3. Expressions based thread list generation with compact processor numbering.")
+    print_stdout("Example usage expression: likwid-pin.lua -c E:N:8 ./myApp")
+    print_stdout("This will generate a compact list of thread to processor mapping for the node domain")
+    print_stdout("with eight threads.")
+    print_stdout("The following syntax variants are available:")
+    print_stdout("\t1. -c E:<thread domain>:<number of threads>")
+    print_stdout("\t2. -c E:<thread domain>:<number of threads>:<chunk size>:<stride>")
+    print_stdout("\tFor two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4")
+    print_stdout("4. Scatter policy among thread domain type.")
+    print_stdout("Example usage scatter: likwid-pin.lua -c M:scatter ./myApp")
+    print_stdout("This will generate a thread to processor mapping scattered among all memory domains")
+    print_stdout("with physical cores first.")
+    print_stdout("")
+    print_stdout("likwid-pin sets OMP_NUM_THREADS with as many threads as specified")
+    print_stdout("in your pin expression if OMP_NUM_THREADS is not present in your environment.")
 end
 
 local function usage()
     version()
-    print("An application to pin a program including threads.\n")
-    print("Options:")
-    print("-h, --help\t\t Help message")
-    print("-v, --version\t\t Version information")
-    print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
-    print("-i\t\t\t Set numa interleave policy with all involved numa nodes")
-    print("-S, --sweep\t\t Sweep memory and LLC of involved NUMA nodes")
-    print("-c <list>\t\t Comma separated processor IDs or expression")
-    print("-s, --skip <hex>\t Bitmask with threads to skip")
-    print("-p\t\t\t Print available domains with mapping on physical IDs")
-    print("\t\t\t If used together with -p option outputs a physical processor IDs.")
-    print("-d <string>\t\t Delimiter used for using -p to output physical processor list, default is comma.")
-    print("-q, --quiet\t\t Silent without output")
-    print("\n")
+    print_stdout("An application to pin a program including threads.\n")
+    print_stdout("Options:")
+    print_stdout("-h, --help\t\t Help message")
+    print_stdout("-v, --version\t\t Version information")
+    print_stdout("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+    print_stdout("-i\t\t\t Set numa interleave policy with all involved numa nodes")
+    print_stdout("-S, --sweep\t\t Sweep memory and LLC of involved NUMA nodes")
+    print_stdout("-c <list>\t\t Comma separated processor IDs or expression")
+    print_stdout("-s, --skip <hex>\t Bitmask with threads to skip")
+    print_stdout("-p\t\t\t Print available domains with mapping on physical IDs")
+    print_stdout("\t\t\t If used together with -p option outputs a physical processor IDs.")
+    print_stdout("-d <string>\t\t Delimiter used for using -p to output physical processor list, default is comma.")
+    print_stdout("-q, --quiet\t\t Silent without output")
+    print_stdout("\n")
     examples()
 end
 
@@ -96,6 +99,7 @@ skip_mask = nil
 affinity = nil
 num_threads = 0
 
+
 config = likwid.getConfiguration()
 cputopo = likwid.getCpuTopology()
 affinity = likwid.getAffinityInfo()
@@ -128,7 +132,7 @@ for opt,arg in likwid.getopt(arg, {"c:", "d:", "h", "i", "p", "q", "s:", "S", "t
             num_threads,cpu_list = likwid.cpustr_to_cpulist_physical(arg)
         end
         if (num_threads == 0) then
-            print("Failed to parse cpulist " .. arg)
+            print_stderr("Failed to parse cpulist " .. arg)
             likwid.putTopology()
             likwid.putAffinityInfo()
             likwid.putConfiguration()
@@ -138,7 +142,7 @@ for opt,arg in likwid.getopt(arg, {"c:", "d:", "h", "i", "p", "q", "s:", "S", "t
         delimiter = arg
     elseif opt == "S" or opt == "sweep" then
         if (affinity == nil) then
-            print("Option -S is not supported for unknown processor!")
+            print_stderr("Option -S is not supported for unknown processor!")
             likwid.putTopology()
             likwid.putAffinityInfo()
             likwid.putConfiguration()
@@ -152,7 +156,7 @@ for opt,arg in likwid.getopt(arg, {"c:", "d:", "h", "i", "p", "q", "s:", "S", "t
     elseif opt == "s" or opt == "skip" then
         local s,e = arg:find("0x")
         if s == nil then
-            print("Skip mask must be given in hex, hence start with 0x")
+            print_stderr("Skip mask must be given in hex, hence start with 0x")
             os.exit(1)
         end
         skip_mask = arg
@@ -160,13 +164,13 @@ for opt,arg in likwid.getopt(arg, {"c:", "d:", "h", "i", "p", "q", "s:", "S", "t
         likwid.setenv("LIKWID_SILENT","true")
         quiet = 1
     elseif opt == "?" then
-        print("Invalid commandline option -"..arg)
+        print_stderr("Invalid commandline option -"..arg)
         likwid.putTopology()
         likwid.putAffinityInfo()
         likwid.putConfiguration()
         os.exit(1)
     elseif opt == "!" then
-        print("Option requires an argument")
+        print_stderr("Option requires an argument")
         likwid.putTopology()
         likwid.putAffinityInfo()
         likwid.putConfiguration()
@@ -180,16 +184,16 @@ if print_domains and num_threads > 0 then
     for i, cpu in pairs(cpu_list) do
         outstr = outstr .. delimiter .. cpu
     end
-    print(outstr:sub(2,outstr:len()))
+    print_stdout(outstr:sub(2,outstr:len()))
     likwid.putTopology()
     likwid.putAffinityInfo()
     likwid.putConfiguration()
     os.exit(0)
 elseif print_domains then
     for k,v in pairs(affinity["domains"]) do
-        print(string.format("Domain %s:", v["tag"]))
-        print("\t" .. table.concat(v["processorList"], ","))
-        print("")
+        print_stdout(string.format("Domain %s:", v["tag"]))
+        print_stdout("\t" .. table.concat(v["processorList"], ","))
+        print_stdout("")
     end
     likwid.putTopology()
     likwid.putAffinityInfo()
@@ -201,25 +205,29 @@ if num_threads == 0 then
     num_threads, cpu_list = likwid.cpustr_to_cpulist("N:0-"..cputopo["numHWThreads"]-1)
 end
 if (#arg == 0) then
-    print("Executable must be given on commandline")
+    print_stderr("Executable must be given on commandline")
     os.exit(1)
 end
 
 if interleaved_policy then
-    print("Set mem_policy to interleaved")
+    if quiet == 0 then
+        print_stdout("Set mem_policy to interleaved")
+    end
     likwid.setMemInterleaved(num_threads, cpu_list)
 end
 
 if sweep_sockets then
-    print("Sweeping memory")
+    if quiet == 0 then
+        print_stdout("Sweeping memory")
+    end
     likwid.memSweep(num_threads, cpu_list)
 end
 
 local omp_threads = os.getenv("OMP_NUM_THREADS")
 if omp_threads == nil then
     likwid.setenv("OMP_NUM_THREADS",tostring(math.tointeger(num_threads)))
-elseif num_threads > tonumber(omp_threads) then
-    print(string.format("Environment variable OMP_NUM_THREADS already set to %s but %d cpus required", omp_threads,num_threads))
+elseif num_threads > tonumber(omp_threads) and quiet == 0 then
+    print_stdout(string.format("Environment variable OMP_NUM_THREADS already set to %s but %d cpus required", omp_threads,num_threads))
 end
 
 likwid.setenv("KMP_AFFINITY","disabled")
@@ -260,16 +268,16 @@ end
 local exec = table.concat(arg," ",1, likwid.tablelength(arg)-2)
 local pid = likwid.startProgram(exec, num_threads, cpu_list)
 if (pid == nil) then
-    print("Failed to execute command: ".. exec)
+    print_stderr("Failed to execute command: ".. exec)
     likwid.putTopology()
     likwid.putAffinityInfo()
     likwid.putConfiguration()
     os.exit(1)
 end
 
-likwid.waitpid(pid)
+local exitvalue = likwid.waitpid(pid)
 
 likwid.putAffinityInfo()
 likwid.putTopology()
 likwid.putConfiguration()
-os.exit(0)
+os.exit(exitvalue)
diff --git a/src/applications/likwid-powermeter.lua b/src/applications/likwid-powermeter.lua
index 3aa742f..813fec3 100644
--- a/src/applications/likwid-powermeter.lua
+++ b/src/applications/likwid-powermeter.lua
@@ -8,7 +8,7 @@
  *      consumption on architectures implementing the RAPL interface.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
@@ -33,34 +33,37 @@ package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
 
 local likwid = require("likwid")
 
+print_stdout = print
+print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end end
+
 local function version()
-    print(string.format("likwid-powermeter --  Version %d.%d",likwid.version,likwid.release))
+    print_stdout(string.format("likwid-powermeter --  Version %d.%d",likwid.version,likwid.release))
 end
 
 local function examples()
-    print("Examples:")
-    print("Measure the power consumption for 4 seconds on socket 1")
-    print("likwid-powermeter -s 4 -c 1")
-    print("")
-    print("Use it as wrapper for an application to measure the energy for the whole execution")
-    print("likwid-powermeter -c 1 ./a.out")
+    print_stdout("Examples:")
+    print_stdout("Measure the power consumption for 4 seconds on socket 1")
+    print_stdout("likwid-powermeter -s 4 -c 1")
+    print_stdout("")
+    print_stdout("Use it as wrapper for an application to measure the energy for the whole execution")
+    print_stdout("likwid-powermeter -c 1 ./a.out")
 end
 
 local function usage()
     version()
-    print("A tool to print power and clocking information on x86 CPUs.\n")
-    print("Options:")
-    print("-h, --help\t Help message")
-    print("-v, --version\t Version information")
-    print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
-    print("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon")
-    print("-c <list>\t\t Specify sockets to measure")
-    print("-i, --info\t Print information from MSR_PKG_POWER_INFO register and Turbo mode")
-    print("-s <duration>\t Set measure duration in us, ms or s. (default 2s)")
-    print("-p\t\t Print dynamic clocking and CPI values, uses likwid-perfctr")
-    print("-t\t\t Print current temperatures of all CPU cores")
-    print("-f\t\t Print current temperatures in Fahrenheit")
-    print("")
+    print_stdout("A tool to print power and clocking information on x86 CPUs.\n")
+    print_stdout("Options:")
+    print_stdout("-h, --help\t Help message")
+    print_stdout("-v, --version\t Version information")
+    print_stdout("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+    print_stdout("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon")
+    print_stdout("-c <list>\t\t Specify sockets to measure")
+    print_stdout("-i, --info\t Print information from MSR_PKG_POWER_INFO register and Turbo mode")
+    print_stdout("-s <duration>\t Set measure duration in us, ms or s. (default 2s)")
+    print_stdout("-p\t\t Print dynamic clocking and CPI values, uses likwid-perfctr")
+    print_stdout("-t\t\t Print current temperatures of all CPU cores")
+    print_stdout("-f\t\t Print current temperatures in Fahrenheit")
+    print_stdout("")
     examples()
 end
 
@@ -92,8 +95,8 @@ for opt,arg in likwid.getopt(arg, {"V:", "c:", "h", "i", "M:", "p", "s:", "v", "
     if (type(arg) == "string") then
         local s,e = arg:find("-");
         if s == 1 then
-            print(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
-            print("Did you forget an argument to an option?")
+            print_stderr(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
+            print_stderr("Did you forget an argument to an option?")
             os.exit(1)
         end
     end
@@ -111,11 +114,11 @@ for opt,arg in likwid.getopt(arg, {"V:", "c:", "h", "i", "M:", "p", "s:", "v", "
     elseif (opt == "M") then
         access_mode = tonumber(arg)
         if (access_mode == nil) then
-            print("Access mode (-M) must be an number")
+            print_stderr("Access mode (-M) must be an number")
             usage()
             os.exit(1)
         elseif (access_mode < 0) or (access_mode > 1) then
-            print(string.format("Access mode (-M) %d not valid.",access_mode))
+            print_stderr(string.format("Access mode (-M) %d not valid.",access_mode))
             usage()
             os.exit(1)
         end
@@ -137,10 +140,10 @@ for opt,arg in likwid.getopt(arg, {"V:", "c:", "h", "i", "M:", "p", "s:", "v", "
         time_orig = arg
         stethoscope = true
     elseif opt == "?" then
-        print("Invalid commandline option -"..arg)
+        print_stderr("Invalid commandline option -"..arg)
         os.exit(1)
     elseif opt == "!" then
-        print("Option requires an argument")
+        print_stderr("Option requires an argument")
         os.exit(1)
     end
 end
@@ -187,37 +190,37 @@ end
 
 power = likwid.getPowerInfo()
 if not power then
-    print(string.format("The %s does not support reading power data",cpuinfo["name"]))
+    print_stderr(string.format("The %s does not support reading power data",cpuinfo["name"]))
     os.exit(1)
 end
 
 
 if not use_perfctr then
-    print(likwid.hline);
-    print(string.format("CPU name:\t%s",cpuinfo["osname"]))
-    print(string.format("CPU type:\t%s",cpuinfo["name"]))
+    print_stdout(likwid.hline);
+    print_stdout(string.format("CPU name:\t%s",cpuinfo["osname"]))
+    print_stdout(string.format("CPU type:\t%s",cpuinfo["name"]))
     if cpuinfo["clock"] > 0 then
-        print(string.format("CPU clock:\t%3.2f GHz",cpuinfo["clock"] *  1.E-09))
+        print_stdout(string.format("CPU clock:\t%3.2f GHz",cpuinfo["clock"] *  1.E-09))
     else
-        print(string.format("CPU clock:\t%3.2f GHz",likwid.getCpuClock() *  1.E-09))
+        print_stdout(string.format("CPU clock:\t%3.2f GHz",likwid.getCpuClock() *  1.E-09))
     end
-    print(likwid.hline)
+    print_stdout(likwid.hline)
 end
 
 if print_info or verbose > 0 then
     if (power["turbo"]["numSteps"] > 0) then
-        print(string.format("Base clock:\t%.2f MHz", power["baseFrequency"]))
-        print(string.format("Minimal clock:\t%.2f MHz", power["minFrequency"]))
-        print("Turbo Boost Steps:")
+        print_stdout(string.format("Base clock:\t%.2f MHz", power["baseFrequency"]))
+        print_stdout(string.format("Minimal clock:\t%.2f MHz", power["minFrequency"]))
+        print_stdout("Turbo Boost Steps:")
         for i,step in pairs(power["turbo"]["steps"]) do
-            print(string.format("C%d %.2f MHz",i-1,power["turbo"]["steps"][i]))
+            print_stdout(string.format("C%d %.2f MHz",i-1,power["turbo"]["steps"][i]))
         end
     end
-    print(likwid.hline)
+    print_stdout(likwid.hline)
 end
 
 if power["hasRAPL"] == 0 then
-    print("Measuring power is not supported on this machine")
+    print_stderr("Measuring power is not supported on this machine")
     os.exit(1)
 end
 
@@ -225,19 +228,19 @@ if (print_info) then
     for i, dname in pairs(domainList) do
         local domain = power["domains"][dname]
         if domain["supportInfo"] then
-            print(string.format("Info for RAPL domain %s:", dname));
-            print(string.format("Thermal Spec Power: %g Watt",domain["tdp"]*1E-6))
-            print(string.format("Minimum Power: %g Watt",domain["minPower"]*1E-6))
-            print(string.format("Maximum Power: %g Watt",domain["maxPower"]*1E-6))
-            print(string.format("Maximum Time Window: %g micro sec",domain["maxTimeWindow"]))
-            print()
+            print_stdout(string.format("Info for RAPL domain %s:", dname));
+            print_stdout(string.format("Thermal Spec Power: %g Watt",domain["tdp"]*1E-6))
+            print_stdout(string.format("Minimum Power: %g Watt",domain["minPower"]*1E-6))
+            print_stdout(string.format("Maximum Power: %g Watt",domain["maxPower"]*1E-6))
+            print_stdout(string.format("Maximum Time Window: %g micro sec",domain["maxTimeWindow"]))
+            print_stdout()
         end
     end
-    print(likwid.hline)
+    print_stdout(likwid.hline)
 end
 
 if (stethoscope) and (time_interval < power["timeUnit"]) then
-    print("Time interval too short, minimum measurement time is "..tostring(power["timeUnit"]).. " us")
+    print_stderr("Time interval too short, minimum measurement time is "..tostring(power["timeUnit"]).. " us")
     os.exit(1)
 end
 
@@ -270,6 +273,7 @@ else
     end
 end
 
+local exitvalue = 0
 if not print_info and not print_temp then
     if stethoscope or (#arg > 0 and not use_perfctr) then
         for i,socket in pairs(sockets) do
@@ -301,7 +305,7 @@ if not print_info and not print_temp then
         else
             local pid = likwid.startProgram(execString, 0, {})
             if not pid then
-                print(string.format("Failed to execute %s!",execString))
+                print_stderr(string.format("Failed to execute %s!",execString))
                 likwid.finalize()
                 os.exit(1)
             end
@@ -317,7 +321,8 @@ if not print_info and not print_temp then
                         if (power["domains"][dom]["supportStatus"]) then after[cpu][dom] = likwid.stopPower(cpu, idx) end
                     end
                 end
-                if remain > 0 or not likwid.checkProgram(pid) then
+                exitvalue = likwid.checkProgram(pid)
+                if remain > 0 or exitvalue >= 0 then
                     io.stdout:flush()
                     break
                 end
@@ -333,27 +338,27 @@ if not print_info and not print_temp then
         end
         runtime = likwid.getClock(time_before, time_after)
 
-        print(likwid.hline)
-        print(string.format("Runtime: %g s",runtime))
+        print_stdout(likwid.hline)
+        print_stdout(string.format("Runtime: %g s",runtime))
 
         for i,socket in pairs(sockets) do
             cpu = cpulist[i]
-            print(string.format("Measure for socket %d on CPU %d", socket,cpu ))
+            print_stdout(string.format("Measure for socket %d on CPU %d", socket,cpu ))
             for j, dom in pairs(domainList) do
                 if power["domains"][dom]["supportStatus"] then
-                    local energy = likwid.calcPower(before[cpu][dom], after[cpu][dom], 0)
-                    print(string.format("Domain %s:", dom))
-                    print(string.format("Energy consumed: %g Joules",energy))
-                    print(string.format("Power consumed: %g Watt",energy/runtime))
+                    local energy = likwid.calcPower(before[cpu][dom], after[cpu][dom], j-1)
+                    print_stdout(string.format("Domain %s:", dom))
+                    print_stdout(string.format("Energy consumed: %g Joules",energy))
+                    print_stdout(string.format("Power consumed: %g Watt",energy/runtime))
                 end
             end
-            if i < #sockets then print("") end
+            if i < #sockets then print_stdout("") end
         end
-        print(likwid.hline)
+        print_stdout(likwid.hline)
     else
         err = os.execute(execString)
         if err == false then
-            print(string.format("Failed to execute %s!",execString))
+            print_stderr(string.format("Failed to execute %s!",execString))
             likwid.putPowerInfo()
             likwid.finalize()
             os.exit(1)
@@ -362,8 +367,8 @@ if not print_info and not print_temp then
 end
 
 if print_temp and (string.find(cpuinfo["features"],"TM2") ~= nil) then
-    print(likwid.hline)
-    print("Current core temperatures:");
+    print_stdout(likwid.hline)
+    print_stdout("Current core temperatures:");
     for i=1,cputopo["numSockets"] do
         local tag = "S" .. tostring(i-1)
         for _, domain in pairs(affinity["domains"]) do
@@ -373,16 +378,17 @@ if print_temp and (string.find(cpuinfo["features"],"TM2") ~= nil) then
                     likwid.initTemp(cpuid);
                     if (fahrenheit) then
                         local f = 1.8*tonumber(likwid.readTemp(cpuid))+32
-                        print(string.format("Socket %d Core %d: %.0f F",i-1,cpuid, f));
+                        print_stdout(string.format("Socket %d Core %d: %.0f F",i-1,cpuid, f));
                     else
-                        print(string.format("Socket %d Core %d: %.0f C",i-1,cpuid, tonumber(likwid.readTemp(cpuid))));
+                        print_stdout(string.format("Socket %d Core %d: %.0f C",i-1,cpuid, tonumber(likwid.readTemp(cpuid))));
                     end
                 end
             end
         end
     end
-    print(likwid.hline)
+    print_stdout(likwid.hline)
 end
 
 likwid.putPowerInfo()
 likwid.finalize()
+os.exit(exitvalue)
diff --git a/src/applications/likwid-setFrequencies.lua b/src/applications/likwid-setFrequencies.lua
index 7a56921..1599320 100644
--- a/src/applications/likwid-setFrequencies.lua
+++ b/src/applications/likwid-setFrequencies.lua
@@ -7,7 +7,7 @@
  *      Description:  A application to set the CPU frequency of CPU cores and domains.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
@@ -33,27 +33,30 @@ package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
 
 local likwid = require("likwid")
 
+print_stdout = print
+print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end end
+
 sys_base_path = "/sys/devices/system/cpu"
 set_command = "<INSTALLED_PREFIX>/sbin/likwid-setFreq"
 
 
 function version()
-    print(string.format("likwid-setFrequencies --  Version %d.%d",likwid.version,likwid.release))
+    print_stdout(string.format("likwid-setFrequencies --  Version %d.%d",likwid.version,likwid.release))
 end
 
 function usage()
     version()
-    print("A tool to adjust frequencies and governors on x86 CPUs.\n")
-    print("Options:")
-    print("-h\t Help message")
-    print("-v\t Version information")
-    print("-c dom\t Likwid thread domain which to apply settings (default are all CPUs)")
-    print("\t See likwid-pin -h for details")
-    print("-g gov\t Set governor (" .. table.concat(getAvailGovs(nil), ", ") .. ") (set to ondemand if omitted)")
-    print("-f freq\t Set fixed frequency, implicitly sets userspace governor")
-    print("-p\t Print current frequencies")
-    print("-l\t List available frequencies")
-    print("-m\t List available governors")
+    print_stdout("A tool to adjust frequencies and governors on x86 CPUs.\n")
+    print_stdout("Options:")
+    print_stdout("-h\t Help message")
+    print_stdout("-v\t Version information")
+    print_stdout("-c dom\t Likwid thread domain which to apply settings (default are all CPUs)")
+    print_stdout("\t See likwid-pin -h for details")
+    print_stdout("-g gov\t Set governor (" .. table.concat(getAvailGovs(nil), ", ") .. ") (set to ondemand if omitted)")
+    print_stdout("-f freq\t Set fixed frequency, implicitly sets userspace governor")
+    print_stdout("-p\t Print current frequencies")
+    print_stdout("-l\t List available frequencies")
+    print_stdout("-m\t List available governors")
 end
 
 function getCurrentMinFreq(cpuid)
@@ -62,7 +65,7 @@ function getCurrentMinFreq(cpuid)
         for cpuid=0,topo["numHWThreads"]-1 do
             fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_min_freq")
             if verbosity == 3 then
-                print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_min_freq" )
+                print_stdout("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_min_freq" )
             end
             line = fp:read("*l")
             if tonumber(line)/1E6 < min then
@@ -73,7 +76,7 @@ function getCurrentMinFreq(cpuid)
     else
         fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_min_freq")
         if verbosity == 3 then
-            print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_min_freq" )
+            print_stdout("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_min_freq" )
         end
         line = fp:read("*l")
         if tonumber(line)/1E6 < min then
@@ -90,7 +93,7 @@ function getCurrentMaxFreq(cpuid)
         for cpuid=0,topo["numHWThreads"]-1 do
             fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_max_freq")
             if verbosity == 3 then
-                print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_max_freq" )
+                print_stdout("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_max_freq" )
             end
             line = fp:read("*l")
             if tonumber(line)/1E6 > max then
@@ -101,7 +104,7 @@ function getCurrentMaxFreq(cpuid)
     else
         fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_max_freq")
         if verbosity == 3 then
-            print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_max_freq" )
+            print_stdout("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_max_freq" )
         end
         line = fp:read("*l")
         if tonumber(line)/1E6 > max then
@@ -122,7 +125,7 @@ function getAvailFreq(cpuid)
     end
     fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_frequencies")
     if verbosity == 3 then
-        print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_frequencies" )
+        print_stdout("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_frequencies" )
     end
     line = fp:read("*l")
     fp:close()
@@ -140,7 +143,7 @@ function getAvailFreq(cpuid)
         j = j + 1
     end
     if verbosity == 1 then
-        print(string.format("The system provides %d scaling frequencies, frequency %s is taken as turbo mode", #avail,turbo))
+        print_stdout(string.format("The system provides %d scaling frequencies, frequency %s is taken as turbo mode", #avail,turbo))
     end
     return avail, tostring(turbo)
 end
@@ -151,7 +154,7 @@ function getCurFreq()
     for cpuid=0,topo["numHWThreads"]-1 do
         local fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_cur_freq")
         if verbosity == 3 then
-            print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_cur_freq" )
+            print_stdout("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_cur_freq" )
         end
         local line = fp:read("*l")
         fp:close()
@@ -161,7 +164,7 @@ function getCurFreq()
         end
         local fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_governor")
         if verbosity == 3 then
-            print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_governor" )
+            print_stdout("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_governor" )
         end
         local line = fp:read("*l")
         fp:close()
@@ -176,7 +179,7 @@ function getAvailGovs(cpuid)
     end
     local fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_governors")
     if verbosity == 3 then
-        print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_governors" )
+        print_stdout("Reading "..sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_governors" )
     end
     local line = fp:read("*l")
     fp:close()
@@ -189,7 +192,7 @@ function getAvailGovs(cpuid)
     end
     table.insert(avail, "turbo")
     if verbosity == 1 then
-        print(string.format("The system provides %d scaling governors", #avail))
+        print_stdout(string.format("The system provides %d scaling governors", #avail))
     end
     return avail
 end
@@ -197,7 +200,7 @@ end
 local function testDriver()
     local fp = io.open(sys_base_path .. "/" .. string.format("cpu%d",0) .. "/cpufreq/scaling_driver")
     if verbosity == 3 then
-        print("Reading "..sys_base_path .. "/" .. string.format("cpu%d",0) .. "/cpufreq/scaling_driver" )
+        print_stdout("Reading "..sys_base_path .. "/" .. string.format("cpu%d",0) .. "/cpufreq/scaling_driver" )
     end
     local line = fp:read("*l")
     fp:close()
@@ -241,15 +244,15 @@ for opt,arg in likwid.getopt(arg, {"g:", "c:", "f:", "l", "p", "h", "v", "m", "h
     elseif (opt == "m") then
         printAvailGovs = true
     elseif opt == "?" then
-        print("Invalid commandline option -"..arg)
+        print_stderr("Invalid commandline option -"..arg)
         os.exit(1)
     elseif opt == "!" then
-        print("Option requires an argument")
+        print_stderr("Option requires an argument")
         os.exit(1)
     end
 end
 if not testDriver() then
-    print("The system does not use the acpi-cpufreq driver, other drivers are not usable with likwid-setFrequencies.")
+    print_stderr("The system does not use the acpi-cpufreq driver, other drivers are not usable with likwid-setFrequencies.")
     os.exit(1)
 end
 
@@ -268,23 +271,23 @@ end
 cpulist = {}
 numthreads, cpulist = likwid.cpustr_to_cpulist(domain)
 if verbosity == 3 then
-    print(string.format("Given CPU expression expands to %d CPU cores:", numthreads))
+    print_stdout(string.format("Given CPU expression expands to %d CPU cores:", numthreads))
     local str = tostring(cpulist[1])
     for i=2, numthreads  do
         str = str .. "," .. tostring(cpulist[i])
     end
-    print(str)
+    print_stdout(str)
 end
 
 
 if printAvailGovs then
     local govs = getAvailGovs(nil)
-    print("Available governors:")
-    print(table.concat(govs, ", "))
+    print_stdout("Available governors:")
+    print_stdout(table.concat(govs, ", "))
 end
 
 if printAvailFreq then
-    print("Available frequencies:")
+    print_stdout("Available frequencies:")
     local out = {}
     local i = 1;
     local freqs, turbo = getAvailFreq(nil)
@@ -295,16 +298,16 @@ if printAvailFreq then
         table.insert(out, freqs[i])
     end
 
-    print(table.concat(out, " "))
+    print_stdout(table.concat(out, " "))
 end
 
 if printCurFreq then
-    print("Current frequencies:")
+    print_stdout("Current frequencies:")
     local freqs = {}
     local govs = {}
     freqs, govs = getCurFreq()
     for i=1,#cpulist do
-        print(string.format("CPU %d: governor %12s frequency %5s GHz",cpulist[i],govs[cpulist[i]], freqs[cpulist[i]]))
+        print_stdout(string.format("CPU %d: governor %12s frequency %5s GHz",cpulist[i],govs[cpulist[i]], freqs[cpulist[i]]))
     end
 end
 
@@ -313,7 +316,7 @@ if printAvailGovs or printAvailFreq or printCurFreq then
 end
 
 if numthreads > 0 and not (frequency or governor) then
-    print("You need to set either a frequency or governor for the selected CPUs on commandline")
+    print_stderr("You need to set either a frequency or governor for the selected CPUs on commandline")
     os.exit(1)
 end
 
@@ -331,7 +334,7 @@ if frequency then
             valid_freq = true
         end
         if not valid_freq then
-            print(string.format("Frequency %s not available for CPU %d! Please select one of\n%s", frequency, cpulist[i], table.concat(freqs, ", ")))
+            print_stderr(string.format("Frequency %s not available for CPU %d! Please select one of\n%s", frequency, cpulist[i], table.concat(freqs, ", ")))
             os.exit(1)
         end
     
@@ -340,11 +343,11 @@ if frequency then
             cmd = cmd .. " " .. governor
         end
         if verbosity == 3 then
-            print("Execute: ".. cmd)
+            print_stdout("Execute: ".. cmd)
         end
         local err = os.execute(cmd)
         if err == false or err == nil then
-            print("Failed to set frequency for CPU "..tostring(cpulist[i]))
+            print_stderr("Failed to set frequency for CPU "..tostring(cpulist[i]))
         end
     end
     if governor then
@@ -370,7 +373,7 @@ if governor then
         end
     end
     if not valid_gov then
-        print(string.format("Governor %s not available! Please select one of\n%s", governor, table.concat(govs, ", ")))
+        print_stderr(string.format("Governor %s not available! Please select one of\n%s", governor, table.concat(govs, ", ")))
         os.exit(1)
     end
     for i=1,#cpulist do
@@ -382,11 +385,11 @@ if governor then
                 cmd = cmd .. tostring(tonumber(cur_freqs[cpulist[i]])*1E6) .. " " .. governor
             end
             if verbosity == 3 then
-                print("Execute: ".. cmd)
+                print_stdout("Execute: ".. cmd)
             end
             local err = os.execute(cmd)
             if err == false or err == nil then
-                print("Failed to set governor for CPU "..tostring(cpulist[i]))
+                print_stderr("Failed to set governor for CPU "..tostring(cpulist[i]))
             end
         end
     end
diff --git a/src/applications/likwid-topology.lua b/src/applications/likwid-topology.lua
index 0123f65..2aabf0c 100644
--- a/src/applications/likwid-topology.lua
+++ b/src/applications/likwid-topology.lua
@@ -8,7 +8,7 @@
  *                    on x86 processors.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
@@ -32,24 +32,26 @@
 package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
 
 local likwid = require("likwid")
-stdout_print = print
+
+print_stdout = print
+print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end end
 
 function version()
-    print(string.format("likwid-topology --  Version %d.%d",likwid.version,likwid.release))
+    io.stdout:write(string.format("likwid-topology --  Version %d.%d\n",likwid.version,likwid.release))
 end
 
 function usage()
     version()
-    print("A tool to print the thread and cache topology on x86 CPUs.\n")
-    print("Options:")
-    print("-h, --help\t\t Help message")
-    print("-v, --version\t\t Version information")
-    print("-V, --verbose <level>\t Set verbosity")
-    print("-c, --caches\t\t List cache information")
-    print("-C, --clock\t\t Measure processor clock")
-    print("-O\t\t\t CSV output")
-    print("-o, --output <file>\t Store output to file. (Optional: Apply text filter)")
-    print("-g\t\t\t Graphical output")
+    io.stdout:write("A tool to print the thread and cache topology on x86 CPUs.\n\n")
+    io.stdout:write("Options:\n")
+    io.stdout:write("-h, --help\t\t Help message\n")
+    io.stdout:write("-v, --version\t\t Version information\n")
+    io.stdout:write("-V, --verbose <level>\t Set verbosity\n")
+    io.stdout:write("-c, --caches\t\t List cache information\n")
+    io.stdout:write("-C, --clock\t\t Measure processor clock\n")
+    io.stdout:write("-O\t\t\t CSV output\n")
+    io.stdout:write("-o, --output <file>\t Store output to file. (Optional: Apply text filter)\n")
+    io.stdout:write("-g\t\t\t Graphical output\n")
 end
 
 print_caches = false
@@ -62,8 +64,8 @@ for opt,arg in likwid.getopt(arg, {"h","v","c","C","g","o:","V:","O","help","ver
     if (type(arg) == "string") then
         local s,e = arg:find("-");
         if s == 1 then
-            print(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
-            print("Did you forget an argument to an option?")
+            print_stderr(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
+            print_stderr("Did you forget an argument to an option?")
             os.exit(1)
         end
     end
@@ -77,7 +79,7 @@ for opt,arg in likwid.getopt(arg, {"h","v","c","C","g","o:","V:","O","help","ver
         if tonumber(arg) >= 0 and tonumber(arg) <=3 then
             likwid.setVerbosity(tonumber(arg))
         else
-            print("Verbosity level not valid. Must be between 0 (only errors) and 3 (developer output)")
+            print_stderr("Verbosity level not valid. Must be between 0 (only errors) and 3 (developer output)")
         end
     elseif opt == "c" or opt == "caches" then
         print_caches = true
@@ -99,10 +101,10 @@ for opt,arg in likwid.getopt(arg, {"h","v","c","C","g","o:","V:","O","help","ver
         io.output(arg..".tmp")
         print = function(...) for k,v in pairs({...}) do io.write(v .. "\n") end end
     elseif opt == "?" then
-        print("Invalid commandline option -"..arg)
+        print_stderr("Invalid commandline option -"..arg)
         os.exit(1)
     elseif opt == "!" then
-        print("Option requires an argument")
+        print_stderr("Option requires an argument")
         os.exit(1)
     end
 end
@@ -143,7 +145,7 @@ table.insert(output_csv, "TABLE,Topology,"..tostring(cputopo["numHWThreads"]))
 table.insert(output_csv, "HWThread\tThread\t\tCore\t\tSocket\t\tAvailable")
 
 for cntr=0,cputopo["numHWThreads"]-1 do
-    if cputopo["threadPool"][cntr]["inCpuSet"] then
+    if cputopo["threadPool"][cntr]["inCpuSet"] == 1 then
         table.insert(output_csv, string.format("%d\t\t%u\t\t%u\t\t%u\t\t*",cntr,
                             cputopo["threadPool"][cntr]["threadId"],
                             cputopo["threadPool"][cntr]["coreId"],
@@ -295,12 +297,12 @@ end
 for _,line in pairs(output_csv) do print(line) end
 
 if print_graphical and not print_csv then
-    print("\n")
-    print(likwid.sline)
-    print("Graphical Topology")
-    print(likwid.sline)
+    print_stdout("\n")
+    print_stdout(likwid.sline)
+    print_stdout("Graphical Topology")
+    print_stdout(likwid.sline)
     for socket=0,cputopo["numSockets"]-1 do
-        print(string.format("Socket %d:",cputopo["topologyTree"][socket]["ID"]))
+        print_stdout(string.format("Socket %d:",cputopo["topologyTree"][socket]["ID"]))
         container = {}
         for core=0,cputopo["numCoresPerSocket"]-1 do
             local tmpString = ""
@@ -330,9 +332,9 @@ if print_graphical and not print_csv then
                     local tmpString = ""
                     local cacheWidth = 0
                     if cputopo["cacheLevels"][cache]["size"] < 1048576 then
-                        tmpString = string.format("%dkB", cputopo["cacheLevels"][cache]["size"]/1024)
+                        tmpString = string.format("%.0f kB", cputopo["cacheLevels"][cache]["size"]/1024)
                     else
-                        tmpString = string.format("%dMB", cputopo["cacheLevels"][cache]["size"]/1048576)
+                        tmpString = string.format("%.0f MB", cputopo["cacheLevels"][cache]["size"]/1048576)
                     end
                     if sharedCores > 1 then
                         if sharedCores > cputopo["numCoresPerSocket"] then
diff --git a/src/applications/likwid.lua b/src/applications/likwid.lua
index a6ffee3..b184b10 100644
--- a/src/applications/likwid.lua
+++ b/src/applications/likwid.lua
@@ -6,7 +6,7 @@
  *      Description:  Lua LIKWID interface library
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
@@ -63,6 +63,7 @@ likwid.getMetric = likwid_getMetric
 likwid.getLastMetric = likwid_getLastMetric
 likwid.getNumberOfGroups = likwid_getNumberOfGroups
 likwid.getRuntimeOfGroup = likwid_getRuntimeOfGroup
+likwid.getLastTimeOfGroup = likwid_getLastTimeOfGroup
 likwid.getIdOfActiveGroup = likwid_getIdOfActiveGroup
 likwid.getNumberOfEvents = likwid_getNumberOfEvents
 likwid.getNumberOfThreads = likwid_getNumberOfThreads
@@ -113,7 +114,7 @@ likwid.checkProgram = likwid_checkProgram
 likwid.killProgram = likwid_killProgram
 likwid.catchSignal = likwid_catchSignal
 likwid.getSignalState = likwid_getSignalState
-likwid.waitpid = likwid_waitwid
+likwid.waitpid = likwid_waitpid
 likwid.cpustr_to_cpulist = likwid_cpustr_to_cpulist
 likwid.nodestr_to_nodelist = likwid_nodestr_to_nodelist
 likwid.sockstr_to_socklist = likwid_sockstr_to_socklist
@@ -860,9 +861,9 @@ local function printOutput(results, metrics, cpulist, region, stats)
         if #cur_cpulist > 1 or stats == true then
             if use_csv then
                 if region == nil then
-                    print(string.format("TABLE,Group %d Raw Stat,%s,%d%s",g,groupName,#firsttab_combined[1]-1,string.rep(",",maxLineFields-4)))
+                    print(string.format("TABLE,Group %d Raw STAT,%s,%d%s",g,groupName,#firsttab_combined[1]-1,string.rep(",",maxLineFields-4)))
                 else
-                    print(string.format("TABLE,Region %s,Group %d Raw Stat,%s,%d%s",regionName, g,groupName,#firsttab_combined[1]-1,string.rep(",",maxLineFields-5)))
+                    print(string.format("TABLE,Region %s,Group %d Raw STAT,%s,%d%s",regionName, g,groupName,#firsttab_combined[1]-1,string.rep(",",maxLineFields-5)))
                 end
                 likwid.printcsv(firsttab_combined, maxLineFields)
             else
@@ -883,9 +884,9 @@ local function printOutput(results, metrics, cpulist, region, stats)
             if #cur_cpulist > 1 or stats == true then
                 if use_csv then
                     if region == nil then
-                        print(string.format("TABLE,Group %d Metric Stat,%s,%d%s",g,groupName,#secondtab_combined[1]-1,string.rep(",",maxLineFields-4)))
+                        print(string.format("TABLE,Group %d Metric STAT,%s,%d%s",g,groupName,#secondtab_combined[1]-1,string.rep(",",maxLineFields-4)))
                     else
-                        print(string.format("TABLE,Region %s,Group %d Metric Stat,%s,%d%s",regionName,g,groupName,#secondtab_combined[1]-1,string.rep(",",maxLineFields-5)))
+                        print(string.format("TABLE,Region %s,Group %d Metric STAT,%s,%d%s",regionName,g,groupName,#secondtab_combined[1]-1,string.rep(",",maxLineFields-5)))
                     end
                     likwid.printcsv(secondtab_combined, maxLineFields)
                 else
diff --git a/src/bitUtil.c b/src/bitUtil.c
index 099626c..6a3ddd5 100644
--- a/src/bitUtil.c
+++ b/src/bitUtil.c
@@ -6,7 +6,7 @@
  *      Description:  Utility routines manipulating bit arrays.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/calculator.c b/src/calculator.c
index bd73a4d..b46132e 100644
--- a/src/calculator.c
+++ b/src/calculator.c
@@ -6,21 +6,24 @@
  *      Description:  Infix calculator
  *
  *      Author:   Brandon Mills (bm), mills.brandont at gmail.com
- *      Project:  likwid
  *
  *      Copyright (C) 2016 Brandon Mills
  *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ *      software and associated documentation files (the "Software"), to deal in the
+ *      Software without restriction, including without limitation the rights to use, copy,
+ *      modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ *      and to permit persons to whom the Software is furnished to do so, subject to the
+ *      following conditions:
  *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *      The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
  *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ *      INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ *      PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *      HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *      OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ *      SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * =======================================================================================
  */
@@ -30,7 +33,7 @@
  *      Some changes done for the integration in LIKWID, see inline comments
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at gmail.com
@@ -481,7 +484,7 @@ int tokenize(char *str, char *(**tokensRef))
                             // Assemble rest of number
                             for(; // Don't change len
                                 *ptr // There is a next character and it is not null
-                                && len <= MAXTOKENLENGTH 
+                                && len <= MAXTOKENLENGTH
                                 && (type(*ptr) == digit // The next character is a digit
                                      || ((type(*ptr) == decimal // Or the next character is a decimal
                                          && hasDecimal == 0)) // But we have not added a decimal
@@ -544,11 +547,11 @@ int tokenize(char *str, char *(**tokensRef))
                     }
 
                     // Assemble rest of number
-                    /* Added support for signed exponents in scientific notation 
+                    /* Added support for signed exponents in scientific notation
                      * by Thomas Roehl (Thomas.Roehl at fau.de) as required for LIKWID */
                     for(; // Don't change len
                         *ptr // There is a next character and it is not null
-                        && len <= MAXTOKENLENGTH 
+                        && len <= MAXTOKENLENGTH
                         && (type(*ptr) == digit // The next character is a digit
                              || ((type(*ptr) == decimal // Or the next character is a decimal
                                  && hasDecimal == false)) // But we have not added a decimal
diff --git a/src/calculator_stack.c b/src/calculator_stack.c
index e14acee..43cae98 100644
--- a/src/calculator_stack.c
+++ b/src/calculator_stack.c
@@ -6,24 +6,27 @@
  *      Description:  Stack implementation for infix calculator
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Brandon Mills (bm), mills.brandont at gmail.com
- *      Project:  likwid
  *
- *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *      Copyright (C) 2016 Brandon Mills
  *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ *      software and associated documentation files (the "Software"), to deal in the
+ *      Software without restriction, including without limitation the rights to use, copy,
+ *      modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ *      and to permit persons to whom the Software is furnished to do so, subject to the
+ *      following conditions:
  *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *      The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
  *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ *      INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ *      PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *      HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *      OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ *      SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * =======================================================================================
  */
diff --git a/src/configuration.c b/src/configuration.c
index f7a9357..b60dcaf 100644
--- a/src/configuration.c
+++ b/src/configuration.c
@@ -6,7 +6,7 @@
  *      Description:  Configuration file module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/cpuFeatures.c b/src/cpuFeatures.c
index e3ecfdc..1c866ff 100644
--- a/src/cpuFeatures.c
+++ b/src/cpuFeatures.c
@@ -10,7 +10,7 @@
  *                  available.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -52,7 +52,6 @@ static int features_initialized = 0;
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
-
 #define PRINT_VALUE(color,string)  \
     color_on(BRIGHT,(color));      \
     printf(#string"\n");            \
@@ -80,9 +79,10 @@ static int features_initialized = 0;
 
 #define IF_FLAG(feature) (cpuFeatureMask[cpu] & (1ULL<<feature))
 
-
 /* #####   FUNCTIONS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-static void cpuFeatures_update(int cpu)
+
+static void
+cpuFeatures_update(int cpu)
 {
     int ret;
     uint64_t flags = 0x0ULL;
@@ -146,6 +146,7 @@ static void cpuFeatures_update(int cpu)
              (cpuid_info.model == ATOM_SILVERMONT_Z2) ||
              (cpuid_info.model == ATOM_SILVERMONT_F) ||
              (cpuid_info.model == ATOM_SILVERMONT_AIR) ||
+             (cpuid_info.model == ATOM_SILVERMONT_GOLD) ||
              (cpuid_info.model == SANDYBRIDGE) ||
              (cpuid_info.model == SANDYBRIDGE_EP) ||
              (cpuid_info.model == IVYBRIDGE) ||
@@ -182,12 +183,15 @@ static void cpuFeatures_update(int cpu)
             (cpuid_info.model == BROADWELL_D) ||
             (cpuid_info.model == BROADWELL_E) ||
             (cpuid_info.model == SKYLAKE1) ||
-            (cpuid_info.model == SKYLAKE2))
+            (cpuid_info.model == SKYLAKE2) ||
+            (cpuid_info.model == ATOM_SILVERMONT_GOLD))
     {
         ret = HPMread(cpu, MSR_DEV, MSR_PREFETCH_ENABLE, &flags);
         if (ret != 0)
         {
-            fprintf(stderr, "Cannot read register 0x%X on cpu %d: err %d\n", MSR_PREFETCH_ENABLE, cpu, ret);
+            fprintf(stderr,
+                    "Cannot read register 0x%X on cpu %d: err %d\n",
+                    MSR_PREFETCH_ENABLE, cpu, ret);
         }
         TEST_FLAG_INV(FEAT_IP_PREFETCHER,3);
         TEST_FLAG_INV(FEAT_DCU_PREFETCHER,2);
@@ -196,7 +200,8 @@ static void cpuFeatures_update(int cpu)
     }
 }
 
-static char* cpuFeatureNames[CPUFEATURES_MAX] = {
+static char*
+cpuFeatureNames[CPUFEATURES_MAX] = {
     [FEAT_HW_PREFETCHER] = "Hardware Prefetcher",
     [FEAT_IP_PREFETCHER] = "IP Prefetcher",
     [FEAT_DCU_PREFETCHER] = "DCU Pretecher",
@@ -223,7 +228,6 @@ static char* cpuFeatureNames[CPUFEATURES_MAX] = {
 void
 cpuFeatures_init()
 {
-    int i;
     if (features_initialized)
     {
         return;
@@ -233,22 +237,25 @@ cpuFeatures_init()
     if (!HPMinitialized())
     {
         HPMinit();
-        
-    }
-    for (i = 0; i < cpuid_topology.numHWThreads; i++)
-    {
-        HPMaddThread(cpuid_topology.threadPool[i].apicId);
-        cpuFeatures_update(cpuid_topology.threadPool[i].apicId);
+
+        for (int i = 0; i < cpuid_topology.numHWThreads; i++)
+        {
+            int ret = HPMaddThread(cpuid_topology.threadPool[i].apicId);
+            if (ret != 0)
+            {
+                ERROR_PRINT(Cannot get access to register CPU feature register on CPU %d, cpuid_topology.threadPool[i].apicId);
+                return;
+            }
+            cpuFeatures_update(cpuid_topology.threadPool[i].apicId);
+        }
     }
 
-    
     features_initialized = 1;
 }
 
 void
 cpuFeatures_print(int cpu)
 {
-    int i;
     uint64_t flags = 0x0ULL;
     if (!features_initialized)
     {
@@ -257,7 +264,7 @@ cpuFeatures_print(int cpu)
     cpuFeatures_update(cpu);
 
     printf(HLINE);
-    for (i=0;i<CPUFEATURES_MAX; i++)
+    for (int i=0; i<CPUFEATURES_MAX; i++)
     {
         if ((cpuid_info.model != CORE2_45) &&
             (cpuid_info.model != CORE2_65) &&
@@ -311,11 +318,13 @@ cpuFeatures_enable(int cpu, CpuFeature type, int print)
             (cpuid_info.model == BROADWELL_D) ||
             (cpuid_info.model == BROADWELL_E) ||
             (cpuid_info.model == SKYLAKE1) ||
-            (cpuid_info.model == SKYLAKE2))
+            (cpuid_info.model == SKYLAKE2) ||
+            (cpuid_info.model == ATOM_SILVERMONT_GOLD))
     {
         reg = MSR_PREFETCH_ENABLE;
         newOffsets = 1;
     }
+
     ret = HPMread(cpu, MSR_DEV, reg, &flags);
     if (ret != 0)
     {
@@ -406,7 +415,6 @@ cpuFeatures_enable(int cpu, CpuFeature type, int print)
     return 0;
 }
 
-
 int
 cpuFeatures_disable(int cpu, CpuFeature type, int print)
 {
@@ -437,7 +445,8 @@ cpuFeatures_disable(int cpu, CpuFeature type, int print)
             (cpuid_info.model == BROADWELL_D) ||
             (cpuid_info.model == BROADWELL_E) ||
             (cpuid_info.model == SKYLAKE1) ||
-            (cpuid_info.model == SKYLAKE2))
+            (cpuid_info.model == SKYLAKE2) ||
+            (cpuid_info.model == ATOM_SILVERMONT_GOLD))
     {
         reg = MSR_PREFETCH_ENABLE;
         newOffsets = 1;
@@ -534,7 +543,8 @@ cpuFeatures_disable(int cpu, CpuFeature type, int print)
     return ret;
 }
 
-int cpuFeatures_get(int cpu, CpuFeature type)
+int
+cpuFeatures_get(int cpu, CpuFeature type)
 {
     if ((type >= FEAT_HW_PREFETCHER) && (type < CPUFEATURES_MAX))
     {
@@ -550,7 +560,8 @@ int cpuFeatures_get(int cpu, CpuFeature type)
     return -EINVAL;
 }
 
-char* cpuFeatures_name(CpuFeature type)
+char*
+cpuFeatures_name(CpuFeature type)
 {
     if ((type >= FEAT_HW_PREFETCHER) && (type < CPUFEATURES_MAX))
     {
@@ -558,3 +569,4 @@ char* cpuFeatures_name(CpuFeature type)
     }
     return NULL;
 }
+
diff --git a/src/cpustring.c b/src/cpustring.c
index 7b57ed0..ed934ac 100644
--- a/src/cpustring.c
+++ b/src/cpustring.c
@@ -6,7 +6,7 @@
  *      Description:  Parser for CPU selection strings
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
@@ -30,6 +30,7 @@
 
 #include <stdlib.h>
 #include <stdio.h>
+#include <math.h>
 
 #include <likwid.h>
 
@@ -43,13 +44,16 @@ static int cpulist_sort(int* incpus, int* outcpus, int length)
     {
         return -1;
     }
+    int inner_loop = ceil((double)length/cpuid_topology->numThreadsPerCore);
     for (int off=0;off < cpuid_topology->numThreadsPerCore;off++)
     {
-        for (int i=0; i<length/cpuid_topology->numThreadsPerCore;i++)
+        for (int i=0; i<inner_loop;i++)
         {
             outcpus[insert] = incpus[(i*cpuid_topology->numThreadsPerCore)+off];
             insert++;
         }
+        if (insert == length)
+            break;
     }
     return insert;
 }
@@ -336,7 +340,7 @@ static int cpustr_to_cpulist_logical(bstring bcpustr, int* cpulist, int length)
         }
         else
         {
-            cpulist[insert] = inlist[atoi(bdata(strlist->entry[i]))];
+            cpulist[insert] = inlist[atoi(bdata(strlist->entry[i])) % ret];
             insert++;
             if (insert == length)
             {
diff --git a/src/hashTable.c b/src/hashTable.c
index 46c0c66..d3bc9ab 100644
--- a/src/hashTable.c
+++ b/src/hashTable.c
@@ -7,7 +7,7 @@
  *                   Used for Marker API result handling.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/access.h b/src/includes/access.h
index b81beb8..c7f95d3 100644
--- a/src/includes/access.h
+++ b/src/includes/access.h
@@ -6,7 +6,7 @@
  *      Description:  Header File HPM access Module
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
diff --git a/src/includes/access_client.h b/src/includes/access_client.h
index 46f1dbb..ecfdb31 100644
--- a/src/includes/access_client.h
+++ b/src/includes/access_client.h
@@ -1,3 +1,32 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access_client.h
+ *
+ *      Description:  Header file for interface to the access daemon for the access module.
+ *
+ *      Version:   4.1
+ *      Released:  8.8.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
 #ifndef LIKWID_ACCESS_CLIENT_H
 #define LIKWID_ACCESS_CLIENT_H
 
diff --git a/src/includes/access_client_types.h b/src/includes/access_client_types.h
index 214aae8..1eb16a9 100644
--- a/src/includes/access_client_types.h
+++ b/src/includes/access_client_types.h
@@ -6,7 +6,7 @@
  *      Description:  Types file for access_client access module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/access_x86.h b/src/includes/access_x86.h
index 1628bee..dbfda7f 100644
--- a/src/includes/access_x86.h
+++ b/src/includes/access_x86.h
@@ -1,3 +1,32 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access_x86.h
+ *
+ *      Description:  Header file for the interface to x86 related functions for the access module.
+ *
+ *      Version:   4.1
+ *      Released:  8.8.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
 #ifndef LIKWID_ACCESS_X86_H
 #define LIKWID_ACCESS_X86_H
 
diff --git a/src/includes/access_x86_msr.h b/src/includes/access_x86_msr.h
index a00c45b..87d3500 100644
--- a/src/includes/access_x86_msr.h
+++ b/src/includes/access_x86_msr.h
@@ -1,3 +1,32 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access_x86_msr.h
+ *
+ *      Description:  Header file for the interface to x86 MSR functions for the access module.
+ *
+ *      Version:   4.1
+ *      Released:  8.8.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
 #ifndef LIKWID_ACCESS_X86_MSR_H
 #define LIKWID_ACCESS_X86_MSR_H
 
diff --git a/src/includes/access_x86_pci.h b/src/includes/access_x86_pci.h
index e932e57..241a1e8 100644
--- a/src/includes/access_x86_pci.h
+++ b/src/includes/access_x86_pci.h
@@ -1,3 +1,32 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access_x86_pci.h
+ *
+ *      Description:  Header file for the interface to x86 PCI functions for the access module.
+ *
+ *      Version:   4.1
+ *      Released:  8.8.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
 #ifndef LIKWID_ACCESS_X86_PCI_H
 #define LIKWID_ACCESS_X86_PCI_H
 
diff --git a/src/includes/affinity.h b/src/includes/affinity.h
index 6f2215c..3692976 100644
--- a/src/includes/affinity.h
+++ b/src/includes/affinity.h
@@ -6,7 +6,7 @@
  *      Description:  Header File affinity Module
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at gmail.com
diff --git a/src/includes/bitUtil.h b/src/includes/bitUtil.h
index e10ad65..b0a17ab 100644
--- a/src/includes/bitUtil.h
+++ b/src/includes/bitUtil.h
@@ -7,7 +7,7 @@
  *                    Helper routines for dealing with bit manipulations
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/calculator.h b/src/includes/calculator.h
index 67ca564..819041a 100644
--- a/src/includes/calculator.h
+++ b/src/includes/calculator.h
@@ -1,30 +1,29 @@
 /*
  * =======================================================================================
  *
- *      Filename:  calculator.h
+ *      Filename:  calculator.h
  *
- *      Description:  Header file for infix calculator
+ *      Description:  Infix calculator
  *
- *      Version:   4.1
- *      Released:  19.5.2016
+ *      Author:   Brandon Mills (bm), mills.brandont at gmail.com
  *
- *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
- *                Thomas Roehl (tr), thomas.roehl at gmail.com
- *      Project:  likwid
+ *      Copyright (C) 2016 Brandon Mills
  *
- *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ *      software and associated documentation files (the "Software"), to deal in the
+ *      Software without restriction, including without limitation the rights to use, copy,
+ *      modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ *      and to permit persons to whom the Software is furnished to do so, subject to the
+ *      following conditions:
  *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
+ *      The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
  *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ *      INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ *      PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *      HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *      OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ *      SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * =======================================================================================
  */
diff --git a/src/includes/calculator_stack.h b/src/includes/calculator_stack.h
index 670f317..1c1ab90 100644
--- a/src/includes/calculator_stack.h
+++ b/src/includes/calculator_stack.h
@@ -1,29 +1,29 @@
 /*
  * =======================================================================================
  *
- *      Filename:  calculator_stack.h
+ *      Filename:  calculator_stack.h
  *
- *      Description:  Stack implementation for infix calculator
- *
- *      Version:   4.1
- *      Released:  19.5.2016
+ *      Description:  Infix calculator
  *
  *      Author:   Brandon Mills (bm), mills.brandont at gmail.com
- *      Project:  likwid
  *
- *      Copyright (C) Brandon Mills
+ *      Copyright (C) 2016 Brandon Mills
  *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ *      software and associated documentation files (the "Software"), to deal in the
+ *      Software without restriction, including without limitation the rights to use, copy,
+ *      modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ *      and to permit persons to whom the Software is furnished to do so, subject to the
+ *      following conditions:
  *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *      The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
  *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ *      INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ *      PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *      HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *      OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ *      SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * =======================================================================================
  */
diff --git a/src/includes/configuration.h b/src/includes/configuration.h
index a6a3334..21b97a5 100644
--- a/src/includes/configuration.h
+++ b/src/includes/configuration.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of Module configuration.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
diff --git a/src/includes/cpuFeatures.h b/src/includes/cpuFeatures.h
index af4d7c2..41c45e4 100644
--- a/src/includes/cpuFeatures.h
+++ b/src/includes/cpuFeatures.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of Module cpuFeatures.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/cpuFeatures_types.h b/src/includes/cpuFeatures_types.h
index 87ed2a2..ec5e9c7 100644
--- a/src/includes/cpuFeatures_types.h
+++ b/src/includes/cpuFeatures_types.h
@@ -6,7 +6,7 @@
  *      Description:  Types file for CpuFeature module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/cpuid.h b/src/includes/cpuid.h
index 7970ced..7cea6d8 100644
--- a/src/includes/cpuid.h
+++ b/src/includes/cpuid.h
@@ -1,12 +1,12 @@
 /*
  * =======================================================================================
  *
- *      Filename:  configuration.h
+ *      Filename:  cpuid.h
  *
  *      Description:  Common macro definition for CPUID instruction
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
diff --git a/src/includes/error.h b/src/includes/error.h
index faabb2e..696db4d 100644
--- a/src/includes/error.h
+++ b/src/includes/error.h
@@ -6,7 +6,7 @@
  *      Description:  Central error handling macros
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at gmail.com
diff --git a/src/includes/hashTable.h b/src/includes/hashTable.h
index 4da4cbf..9824e1d 100644
--- a/src/includes/hashTable.h
+++ b/src/includes/hashTable.h
@@ -8,7 +8,7 @@
  *                    specific region information.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/libperfctr_types.h b/src/includes/libperfctr_types.h
index 6e375b6..7cf836e 100644
--- a/src/includes/libperfctr_types.h
+++ b/src/includes/libperfctr_types.h
@@ -6,7 +6,7 @@
  *      Description:  Types file for libperfctr module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/likwid.h b/src/includes/likwid.h
index d900a0d..d2ec5e9 100644
--- a/src/includes/likwid.h
+++ b/src/includes/likwid.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of likwid API
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Authors:  Thomas Roehl (tr), thomas.roehl at googlemail.com
  *
@@ -674,6 +674,7 @@ extern int sockstr_to_socklist(char* sockstr, int* sockets, int length)  __attri
 /** \addtogroup PerfMon Performance monitoring module
  *  @{
  */
+
 /*! \brief Get all groups
 
 Checks the configured performance group path for the current architecture and
@@ -770,7 +771,8 @@ extern int perfmon_readGroupCounters(int groupId) __attribute__ ((visibility ("d
 Read the counters that have been previously started by perfmon_startCounters().
 The counters are stopped directly to avoid interference of LIKWID with the measured
 code. Before returning, the counters are started again. Only one thread's CPU is read.
-@param [in] groupId Read the counters for on thread taking part in group
+@param [in] groupId Read the counters defined in group identified with groupId
+@param [in] threadId Read the counters for the thread
 @return 0 on success and -(thread_id+1) for error
 */
 extern int perfmon_readGroupThreadCounters(int groupId, int threadId) __attribute__ ((visibility ("default") ));
@@ -1346,7 +1348,8 @@ extern void memsweep_threadGroup(int* processorList, int numberOfProcessors) __a
 /** \addtogroup CpuFeatures Retrieval and manipulation of processor features
  *  @{
  */
-
+/*! \brief Enumeration of all CPU related features.
+*/
 typedef enum {
     FEAT_HW_PREFETCHER=0, /*!< \brief Hardware prefetcher */
     FEAT_CL_PREFETCHER, /*!< \brief Adjacent cache line prefetcher */
@@ -1401,6 +1404,7 @@ extern char* cpuFeatures_name(CpuFeature type)  __attribute__ ((visibility ("def
 Enable a CPU feature for a specific CPU. Only the state of the prefetchers can be changed, all other features return -EINVAL
 @param [in] cpu CPU ID
 @param [in] type CPU feature
+@param [in] print Print outcome of operation
 @return Status of operation (0=success, all others are erros, either by MSR access or invalid feature)
 */
 extern int cpuFeatures_enable(int cpu, CpuFeature type, int print) __attribute__ ((visibility ("default") ));
@@ -1409,6 +1413,7 @@ extern int cpuFeatures_enable(int cpu, CpuFeature type, int print) __attribute__
 Disable a CPU feature for a specific CPU. Only the state of the prefetchers can be changed, all other features return -EINVAL
 @param [in] cpu CPU ID
 @param [in] type CPU feature
+@param [in] print Print outcome of operation
 @return Status of operation (0=success, all others are erros, either by MSR access or invalid feature)
 */
 extern int cpuFeatures_disable(int cpu, CpuFeature type, int print) __attribute__ ((visibility ("default") ));
diff --git a/src/includes/lock.h b/src/includes/lock.h
index 93f3d9b..623e81c 100644
--- a/src/includes/lock.h
+++ b/src/includes/lock.h
@@ -6,7 +6,7 @@
  *      Description:  Header File Locking primitive Module
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/memsweep.h b/src/includes/memsweep.h
index de7a7b0..dca6862 100644
--- a/src/includes/memsweep.h
+++ b/src/includes/memsweep.h
@@ -7,7 +7,7 @@
  *                    defined in likwid.h
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/numa.h b/src/includes/numa.h
index 3ca582f..681894c 100644
--- a/src/includes/numa.h
+++ b/src/includes/numa.h
@@ -7,7 +7,7 @@
  *                    defined in likwid.h
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/numa_hwloc.h b/src/includes/numa_hwloc.h
index cf74238..33af62d 100644
--- a/src/includes/numa_hwloc.h
+++ b/src/includes/numa_hwloc.h
@@ -6,7 +6,7 @@
  *      Description:  Header File hwloc NUMA backend
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/numa_proc.h b/src/includes/numa_proc.h
index 71af378..24d39e7 100644
--- a/src/includes/numa_proc.h
+++ b/src/includes/numa_proc.h
@@ -6,7 +6,7 @@
  *      Description:  Header File procfs/sysfs NUMA backend
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/pci_hwloc.h b/src/includes/pci_hwloc.h
index fd7db29..9533b49 100644
--- a/src/includes/pci_hwloc.h
+++ b/src/includes/pci_hwloc.h
@@ -6,7 +6,7 @@
  *      Description:  Header File hwloc based PCI lookup backend
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/pci_proc.h b/src/includes/pci_proc.h
index 062daa9..3aa859c 100644
--- a/src/includes/pci_proc.h
+++ b/src/includes/pci_proc.h
@@ -6,7 +6,7 @@
  *      Description:  Header File procfs based PCI lookup backend
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/pci_types.h b/src/includes/pci_types.h
index 7e8495b..651409a 100644
--- a/src/includes/pci_types.h
+++ b/src/includes/pci_types.h
@@ -6,7 +6,7 @@
  *      Description:  Types file for pci module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfgroup.h b/src/includes/perfgroup.h
index c4f25ec..78c5453 100644
--- a/src/includes/perfgroup.h
+++ b/src/includes/perfgroup.h
@@ -1,12 +1,12 @@
 /*
  * =======================================================================================
  *
- *      Filename:  configuration.h
+ *      Filename:  perfgroup.h
  *
  *      Description:  Header File of performance group and event set handler
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon.h b/src/includes/perfmon.h
index 37058c1..0c39093 100644
--- a/src/includes/perfmon.h
+++ b/src/includes/perfmon.h
@@ -8,7 +8,7 @@
  *                    on x86 based architectures. Supports multi threading.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_atom.h b/src/includes/perfmon_atom.h
index 73cc9f9..e96c8e0 100644
--- a/src/includes/perfmon_atom.h
+++ b/src/includes/perfmon_atom.h
@@ -6,7 +6,7 @@
  *      Description:  Header file of perfmon module for Atom
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_atom_events.txt b/src/includes/perfmon_atom_events.txt
index cb4e2fc..26ec5b9 100644
--- a/src/includes/perfmon_atom_events.txt
+++ b/src/includes/perfmon_atom_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Atom
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_broadwell.h b/src/includes/perfmon_broadwell.h
index 8e5fc2a..b22ed0c 100644
--- a/src/includes/perfmon_broadwell.h
+++ b/src/includes/perfmon_broadwell.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module for Intel Broadwell.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
@@ -305,7 +305,7 @@ int bdwep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
                     filter_flags1 |= (extractBitField(event->options[j].value,16,0));
                     break;
                 case EVENT_OPTION_STATE:
-                    filter_flags0 |= (extractBitField(event->options[j].value,6,0) << 17);
+                    filter_flags0 |= (extractBitField(event->options[j].value,7,0) << 17);
                     set_state_all = 0;
                     break;
                 case EVENT_OPTION_TID:
@@ -320,7 +320,7 @@ int bdwep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
             }
         }
     }
-    
+
     if (filter_flags0 != 0x0ULL)
     {
         VERBOSEPRINTREG(cpu_id, filter0, filter_flags0, SETUP_CBOX_FILTER0);
@@ -1134,6 +1134,14 @@ int perfmon_setupCounterThread_broadwell(
                 break;
         }
     }
+    for (int i=UNCORE;i<NUM_UNITS;i++)
+    {
+        if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+        {
+            VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+            HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+        }
+    }
     if (fixed_flags > 0x0ULL)
     {
         VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
@@ -1470,44 +1478,45 @@ int perfmon_stopCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSe
 
                 case QBOX0FIX:
                 case QBOX1FIX:
-                case QBOX2FIX:
-                    if (eventSet->events[i].event.eventId == 0x00)
+                    if (haveLock)
                     {
                         HPMread(cpu_id, dev, counter1, &counter_result);
-                        switch(extractBitField(counter_result, 3, 0))
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_QBOXFIX)
+                        eventSet->events[i].threadCounter[thread_id].startData = 0;
+                        if (eventSet->events[i].event.eventId == 0x00)
                         {
-                            case 0x2:
-                                counter_result = 5.6E9;
-                                break;
-                            case 0x3:
-                                counter_result = 6.4E9;
-                                break;
-                            case 0x4:
-                                counter_result = 7.2E9;
-                                break;
-                            case 0x5:
-                                counter_result = 8.0E9;
-                                break;
-                            case 0x6:
-                                counter_result = 8.8E9;
-                                break;
-                            case 0x7:
-                                counter_result = 9.6E9;
-                                break;
-                            default:
-                                counter_result = 0;
-                                break;
+                            switch(extractBitField(counter_result, 3, 0))
+                            {
+                                case 0x2:
+                                    counter_result = 5.6E9;
+                                    break;
+                                case 0x3:
+                                    counter_result = 6.4E9;
+                                    break;
+                                case 0x4:
+                                    counter_result = 7.2E9;
+                                    break;
+                                case 0x5:
+                                    counter_result = 8.0E9;
+                                    break;
+                                case 0x6:
+                                    counter_result = 8.8E9;
+                                    break;
+                                case 0x7:
+                                    counter_result = 9.6E9;
+                                    break;
+                                default:
+                                    counter_result = 0;
+                                    break;
+                            }
+
+                        }
+                        else if ((eventSet->events[i].event.eventId == 0x01) ||
+                                 (eventSet->events[i].event.eventId == 0x02))
+                        {
+                            counter_result = field64(counter_result, 0, box_map[type].regWidth);
                         }
-                        
-                    }
-                    else if ((eventSet->events[i].event.eventId == 0x01) ||
-                             (eventSet->events[i].event.eventId == 0x02))
-                    {
-                        HPMread(cpu_id, dev, counter1, &counter_result);
-                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_QBOXFIX);
-                        counter_result = field64(counter_result, 0, box_map[type].regWidth);
                     }
-                    eventSet->events[i].threadCounter[thread_id].counterData = counter_result;
                     break;
 
                 default:
@@ -1515,7 +1524,6 @@ int perfmon_stopCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSe
             }
             *current = field64(counter_result, 0, box_map[type].regWidth);
         }
-        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
 
 
@@ -1662,43 +1670,46 @@ int perfmon_readCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSe
 
                 case QBOX0FIX:
                 case QBOX1FIX:
-                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_QBOXFIX)
-                    if (eventSet->events[i].event.eventId == 0x00)
+                    if (haveLock)
                     {
-                        HPMread(cpu_id, dev, counter1, &counter_result);
-                        switch(extractBitField(counter_result, 3, 0))
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_QBOXFIX)
+                        if (eventSet->events[i].event.eventId == 0x00)
                         {
-                            case 0x2:
-                                counter_result = 5.6E9;
-                                break;
-                            case 0x3:
-                                counter_result = 6.4E9;
-                                break;
-                            case 0x4:
-                                counter_result = 7.2E9;
-                                break;
-                            case 0x5:
-                                counter_result = 8.0E9;
-                                break;
-                            case 0x6:
-                                counter_result = 8.8E9;
-                                break;
-                            case 0x7:
-                                counter_result = 9.6E9;
-                                break;
-                            default:
-                                counter_result = 0;
-                                break;
+                            HPMread(cpu_id, dev, counter1, &counter_result);
+                            switch(extractBitField(counter_result, 3, 0))
+                            {
+                                case 0x2:
+                                    counter_result = 5.6E9;
+                                    break;
+                                case 0x3:
+                                    counter_result = 6.4E9;
+                                    break;
+                                case 0x4:
+                                    counter_result = 7.2E9;
+                                    break;
+                                case 0x5:
+                                    counter_result = 8.0E9;
+                                    break;
+                                case 0x6:
+                                    counter_result = 8.8E9;
+                                    break;
+                                case 0x7:
+                                    counter_result = 9.6E9;
+                                    break;
+                                default:
+                                    counter_result = 0;
+                                    break;
+                            }
+
                         }
-                        
-                    }
-                    else if ((eventSet->events[i].event.eventId == 0x01) ||
-                             (eventSet->events[i].event.eventId == 0x02))
-                    {
-                        HPMread(cpu_id, dev, counter1, &counter_result);
-                        counter_result = field64(counter_result, 0, box_map[type].regWidth);
+                        else if ((eventSet->events[i].event.eventId == 0x01) ||
+                                 (eventSet->events[i].event.eventId == 0x02))
+                        {
+                            HPMread(cpu_id, dev, counter1, &counter_result);
+                            counter_result = field64(counter_result, 0, box_map[type].regWidth);
+                        }
+                        *current = counter_result;
                     }
-                    *current = counter_result;
                     break;
 
                 default:
@@ -1770,6 +1781,13 @@ int perfmon_finalizeCountersThread_broadwell(int thread_id, PerfmonEventSet* eve
             ovf_values_uncore = 0x0ULL;
             VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
             CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            if (counter_map[index].counterRegister2 != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
+            }
         }
         eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
@@ -1780,6 +1798,24 @@ int perfmon_finalizeCountersThread_broadwell(int thread_id, PerfmonEventSet* eve
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, ovf_values_uncore));
         VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, 0x0ULL));
+        for (int i=UNCORE;i<NUM_UNITS;i++)
+        {
+            if ((eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+                HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+                if (box_map[i].filterRegister1)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL);
+                }
+                if (box_map[i].filterRegister2)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL);
+                }
+            }
+        }
     }
 
     if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
diff --git a/src/includes/perfmon_broadwellEP_counters.h b/src/includes/perfmon_broadwellEP_counters.h
index d37c871..6c693e9 100644
--- a/src/includes/perfmon_broadwellEP_counters.h
+++ b/src/includes/perfmon_broadwellEP_counters.h
@@ -6,7 +6,7 @@
  *      Description:  Counter Header File of perfmon module for Broadwell EP/EN/EX.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
@@ -329,8 +329,9 @@ static BoxMap broadwellEP_box_map[NUM_UNITS] = {
     [QBOX0] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 25, 1, PCI_QPI_DEVICE_PORT_0, 48},
     [QBOX1] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 26, 1, PCI_QPI_DEVICE_PORT_1, 48},
     [QBOX2] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, -1, 1, PCI_QPI_DEVICE_PORT_2, 48},
-    [QBOX0FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32},
-    [QBOX1FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+    [QBOX0FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_0, 64},
+    [QBOX1FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_1, 64},
+    [QBOX2FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_2, 64},
 };
 
 static PciDevice broadwellEP_pci_devices[MAX_NUM_PCI_DEVICES] = {
diff --git a/src/includes/perfmon_broadwellEP_events.txt b/src/includes/perfmon_broadwellEP_events.txt
index 0781ebe..c03e9ba 100644
--- a/src/includes/perfmon_broadwellEP_events.txt
+++ b/src/includes/perfmon_broadwellEP_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Broadwell EP/EN/EX.
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
@@ -2559,11 +2559,11 @@ UMASK_VNA_CREDIT_RETURNS                0x00 0x01 0x00
 EVENT_VNA_CREDIT_RETURN_OCCUPANCY       0x1B QBOX
 UMASK_VNA_CREDIT_RETURN_OCCUPANCY       0x00 0x01 0x00
 
-EVENT_QPI_RATE                          0x00 QBOX0FIX0|QBOX1FIX0
+EVENT_QPI_RATE                          0x00 QBOX0FIX0|QBOX1FIX0|QBOX2FIX0
 UMASK_QPI_RATE                          0x00
 
-EVENT_QPI_RX_IDLE                       0x01 QBOX0FIX1|QBOX1FIX1
+EVENT_QPI_RX_IDLE                       0x01 QBOX0FIX1|QBOX1FIX1|QBOX2FIX1
 UMASK_QPI_RX_IDLE                       0x00
 
-EVENT_QPI_RX_LLR                        0x02 QBOX0FIX2|QBOX1FIX2
+EVENT_QPI_RX_LLR                        0x02 QBOX0FIX2|QBOX1FIX2|QBOX2FIX2
 UMASK_QPI_RX_LLR                        0x00
diff --git a/src/includes/perfmon_broadwell_counters.h b/src/includes/perfmon_broadwell_counters.h
index d5608ba..362e9de 100644
--- a/src/includes/perfmon_broadwell_counters.h
+++ b/src/includes/perfmon_broadwell_counters.h
@@ -6,7 +6,7 @@
  *      Description:  Counter Header File of perfmon module for Broadwell.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_broadwell_events.txt b/src/includes/perfmon_broadwell_events.txt
index 023bc01..548b355 100644
--- a/src/includes/perfmon_broadwell_events.txt
+++ b/src/includes/perfmon_broadwell_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Broadwell
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_broadwelld_counters.h b/src/includes/perfmon_broadwelld_counters.h
index 37f70ad..c195ff2 100644
--- a/src/includes/perfmon_broadwelld_counters.h
+++ b/src/includes/perfmon_broadwelld_counters.h
@@ -1,12 +1,12 @@
 /*
  * =======================================================================================
  *
- *      Filename:  perfmon_broadwellD_counters.h
+ *      Filename:  perfmon_broadwelld_counters.h
  *
  *      Description:  Counter Header File of perfmon module for Broadwell D.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_broadwelld_events.txt b/src/includes/perfmon_broadwelld_events.txt
index 88c5add..e52f292 100644
--- a/src/includes/perfmon_broadwelld_events.txt
+++ b/src/includes/perfmon_broadwelld_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Broadwell D
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_core2.h b/src/includes/perfmon_core2.h
index 9c4ba1d..ec3f0af 100644
--- a/src/includes/perfmon_core2.h
+++ b/src/includes/perfmon_core2.h
@@ -6,7 +6,7 @@
  *      Description:  Header file of perfmon module for Intel Core 2
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -326,6 +326,8 @@ int perfmon_finalizeCountersThread_core2(int thread_id, PerfmonEventSet* eventSe
         {
             VERBOSEPRINTREG(cpu_id, reg, LLU_CAST 0x0ULL, CLEAR_CTRL);
             CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+            VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister, 0x0ULL));
         }
     }
     VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
diff --git a/src/includes/perfmon_core2_counters.h b/src/includes/perfmon_core2_counters.h
index 2dada93..e3ae594 100644
--- a/src/includes/perfmon_core2_counters.h
+++ b/src/includes/perfmon_core2_counters.h
@@ -6,7 +6,7 @@
  *      Description:  Counter header file of perfmon module for Intel Core 2
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_core2_events.txt b/src/includes/perfmon_core2_events.txt
index ebb2dc5..93ad0b7 100644
--- a/src/includes/perfmon_core2_events.txt
+++ b/src/includes/perfmon_core2_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Core 2
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_skylake.h b/src/includes/perfmon_goldmont.h
similarity index 66%
copy from src/includes/perfmon_skylake.h
copy to src/includes/perfmon_goldmont.h
index 1a10dc4..14270fe 100644
--- a/src/includes/perfmon_skylake.h
+++ b/src/includes/perfmon_goldmont.h
@@ -1,12 +1,12 @@
 /*
  * =======================================================================================
  *
- *      Filename:  perfmon_skylake.h
+ *      Filename:  perfmon_goldmont.h
  *
- *      Description:  Header File of perfmon module for Intel Skylake.
+ *      Description:  Header File of perfmon module for Intel Goldmont.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -29,19 +29,19 @@
  * =======================================================================================
  */
 
-#include <perfmon_skylake_events.h>
-#include <perfmon_skylake_counters.h>
+#include <perfmon_goldmont_events.h>
+#include <perfmon_goldmont_counters.h>
 #include <error.h>
 #include <affinity.h>
 #include <limits.h>
 #include <topology.h>
 #include <access.h>
 
-static int perfmon_numCountersSkylake = NUM_COUNTERS_SKYLAKE;
-static int perfmon_numCoreCountersSkylake = NUM_COUNTERS_CORE_SKYLAKE;
-static int perfmon_numArchEventsSkylake = NUM_ARCH_EVENTS_SKYLAKE;
+static int perfmon_numCountersGoldmont = NUM_COUNTERS_GOLDMONT;
+static int perfmon_numCoreCountersGoldmont = NUM_COUNTERS_CORE_GOLDMONT;
+static int perfmon_numArchEventsGoldmont = NUM_ARCH_EVENTS_GOLDMONT;
 
-int perfmon_init_skylake(int cpu_id)
+int perfmon_init_goldmont(int cpu_id)
 {
     lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
     lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
@@ -49,7 +49,7 @@ int perfmon_init_skylake(int cpu_id)
     return 0;
 }
 
-uint32_t skl_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+uint32_t glm_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
     int j;
     uint32_t flags = (1ULL<<(1+(index*4)));
@@ -69,7 +69,7 @@ uint32_t skl_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
     return flags;
 }
 
-int skl_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+int glm_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
     int j;
     uint64_t flags = 0x0ULL;
@@ -154,44 +154,7 @@ int skl_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
     return 0;
 }
 
-int skl_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
-{
-    int j;
-    uint64_t flags = 0x0ULL;
-    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
-    {
-        return 0;
-    }
-    flags = (1ULL<<22)|(1ULL<<20);
-    flags |= (event->umask<<8) + event->eventId;
-    if (event->numberOfOptions > 0)
-    {
-        for(j = 0; j < event->numberOfOptions; j++)
-        {
-            switch (event->options[j].type)
-            {
-                case EVENT_OPTION_EDGE:
-                    flags |= (1ULL<<18);
-                    break;
-                case EVENT_OPTION_INVERT:
-                    flags |= (1ULL<<23);
-                    break;
-                case EVENT_OPTION_THRESHOLD:
-                    flags |= (event->options[j].value & 0x1FULL) << 24;
-                    break;
-            }
-        }
-    }
-    if (flags != currentConfig[cpu_id][index])
-    {
-        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
-        currentConfig[cpu_id][index] = flags;
-    }
-    return 0;
-}
-
-int perfmon_setupCounterThread_skylake(
+int perfmon_setupCounterThread_goldmont(
         int thread_id,
         PerfmonEventSet* eventSet)
 {
@@ -212,12 +175,6 @@ int perfmon_setupCounterThread_skylake(
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0xC00000070000000F));
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
     }
-    if (haveLock && eventSet->regTypeMask & ~(0xFULL))
-    {
-        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UBOXFIX)
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, 0x0ULL));
-    }
 
     for (int i=0;i < eventSet->numberOfEvents;i++)
     {
@@ -234,38 +191,15 @@ int perfmon_setupCounterThread_skylake(
         switch (type)
         {
             case PMC:
-                skl_pmc_setup(cpu_id, index, event);
+                glm_pmc_setup(cpu_id, index, event);
                 break;
 
             case FIXED:
-                fixed_flags |= skl_fixed_setup(cpu_id, index, event);
+                fixed_flags |= glm_fixed_setup(cpu_id, index, event);
                 break;
 
             case POWER:
                 break;
-            case UBOXFIX:
-                if (haveLock)
-                {
-                    uint64_t uflags = 0x0ULL;
-                    uflags |= (1ULL<<20)|(1ULL<<22);
-                    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, uflags, SETUP_UBOXFIX)
-                    HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, uflags);
-                }
-                break;
-            case UBOX:
-                if (haveLock)
-                {
-                    uint64_t uflags = 0x0ULL;
-                    uflags |= (1ULL<<20)|(1ULL<<22);
-                    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, uflags, CLEAR_UBOX)
-                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, uflags));
-                }
-                break;
-            case CBOX0:
-            case CBOX1:
-            case CBOX2:
-            case CBOX3:
-                skl_cbox_setup(cpu_id, index, event);
                 break;
             default:
                 break;
@@ -279,7 +213,7 @@ int perfmon_setupCounterThread_skylake(
     return 0;
 }
 
-int perfmon_startCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+int perfmon_startCountersThread_goldmont(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
     uint64_t flags = 0x0ULL;
@@ -329,29 +263,6 @@ int perfmon_startCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet
                         eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
                     }
                     break;
-                case UBOXFIX:
-                    if (haveLock)
-                    {
-                        VERBOSEPRINTREG(cpu_id, counter1, 0x0ULL, CLEAR_UBOXFIX)
-                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
-                    }
-                    break;
-                case UBOX:
-                    if (haveLock)
-                    {
-                        VERBOSEPRINTREG(cpu_id, counter1, 0x0ULL, CLEAR_UBOX)
-                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
-                    }
-                    break;
-                case CBOX0:
-                case CBOX1:
-                case CBOX2:
-                case CBOX3:
-                    if (haveLock)
-                    {
-                        uflags |= (1ULL<<(type-CBOX0));
-                    }
-                    break;
                 default:
                     break;
             }
@@ -359,12 +270,6 @@ int perfmon_startCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet
         }
     }
 
-    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
-    {
-        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags|(1ULL<<29), UNFREEZE_UBOXFIX)
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags|(1ULL<<29)));
-    }
-
     if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
         VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST (1ULL<<63)|(1ULL<<62)|flags, CLEAR_PMC_AND_FIXED_OVERFLOW)
@@ -377,7 +282,7 @@ int perfmon_startCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet
 }
 
 
-#define SKL_CHECK_CORE_OVERFLOW(offset) \
+#define GLM_CHECK_CORE_OVERFLOW(offset) \
     if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
     { \
         uint64_t ovf_values = 0x0ULL; \
@@ -389,18 +294,7 @@ int perfmon_startCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<offset))); \
     }
 
-#define SKL_CHECK_UNCORE_OVERFLOW(offset) \
-    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
-    { \
-        uint64_t ovf_values = 0x0ULL; \
-        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, &ovf_values)); \
-        if (ovf_values & (1ULL<<offset)) \
-        { \
-            eventSet->events[i].threadCounter[thread_id].overflows++; \
-        } \
-    }
-
-#define SKL_CHECK_LOCAL_OVERFLOW \
+#define GLM_CHECK_LOCAL_OVERFLOW \
     if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
     { \
         uint64_t ovf_values = 0x0ULL; \
@@ -413,7 +307,7 @@ int perfmon_startCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet
         } \
     }
 
-int perfmon_stopCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+int perfmon_stopCountersThread_goldmont(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
     uint64_t counter_result = 0x0ULL;
@@ -429,11 +323,6 @@ int perfmon_stopCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
         VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
     }
-    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
-    {
-        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UBOXFIX)
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
-    }
 
     for (int i=0;i < eventSet->numberOfEvents;i++)
     {
@@ -455,14 +344,14 @@ int perfmon_stopCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
             {
                 case PMC:
                     CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
-                    SKL_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    GLM_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
                     VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
                     *current = field64(counter_result, 0, box_map[type].regWidth);
                     break;
 
                 case FIXED:
                     CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
-                    SKL_CHECK_CORE_OVERFLOW(index+32);
+                    GLM_CHECK_CORE_OVERFLOW(index+32);
                     VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
                     *current = field64(counter_result, 0, box_map[type].regWidth);
                     break;
@@ -484,56 +373,18 @@ int perfmon_stopCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
                     CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
                     *current = field64(counter_result, 0, box_map[type].regWidth);
                     break;
-                
-                case UBOXFIX:
-                    if (haveLock)
-                    {
-                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
-                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
-                        *current = field64(counter_result, 0, 44);
-                    }
-                    break;
-                case UBOX:
-                    if (haveLock)
-                    {
-                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
-                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
-                        *current = field64(counter_result, 0, 44);
-                    }
-                    break;
-                case CBOX0:
-                case CBOX1:
-                case CBOX2:
-                case CBOX3:
-                    if (haveLock)
-                    {
-                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
-                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
-                        *current = field64(counter_result, 0, box_map[type].regWidth);
-                    }
-                    break;
 
                 default:
                     break;
             }
         }
-        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
-    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
-    {
-        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, &counter_result));
-        if (counter_result != 0x0ULL)
-        {
-            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, counter_result));
-        }
-    }
-    
 
     return 0;
 }
 
 
-int perfmon_readCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+int perfmon_readCountersThread_goldmont(int thread_id, PerfmonEventSet* eventSet)
 {
     uint64_t flags = 0x0ULL;
     uint64_t uflags = 0x0ULL;
@@ -554,14 +405,6 @@ int perfmon_readCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
         VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
     }
 
-    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
-    {
-        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, &uflags));
-        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, LLU_CAST uflags, SAFE_UBOXFIX_FLAGS)
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
-        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, RESET_UBOXFIX_FLAGS)
-    }
-
     for (int i=0;i < eventSet->numberOfEvents;i++)
     {
         if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
@@ -582,14 +425,14 @@ int perfmon_readCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
             {
                 case PMC:
                     CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
-                    SKL_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    GLM_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
                     VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
                     eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
                     break;
 
                 case FIXED:
                     CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
-                    SKL_CHECK_CORE_OVERFLOW(index+32);
+                    GLM_CHECK_CORE_OVERFLOW(index+32);
                     VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
                     eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
                     break;
@@ -607,54 +450,12 @@ int perfmon_readCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
                     }
                     break;
 
-                case THERMAL:
-                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
-                    eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
-                    break;
-                
-                case UBOXFIX:
-                    if (haveLock)
-                    {
-                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
-                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
-                        *current = field64(counter_result, 0, box_map[type].regWidth);
-                    }
-                    break;
-                case UBOX:
-                    if (haveLock)
-                    {
-                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
-                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
-                        *current = field64(counter_result, 0, box_map[type].regWidth);
-                    }
-                    break;
-                case CBOX0:
-                case CBOX1:
-                case CBOX2:
-                case CBOX3:
-                    if (haveLock)
-                    {
-                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
-                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
-                        *current = field64(counter_result, 0, box_map[type].regWidth);
-                    }
-                    break;
 
                 default:
                     break;
             }
         }
     }
-    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
-    {
-        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, &counter_result));
-        if (counter_result != 0x0ULL)
-        {
-            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, counter_result));
-        }
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags));
-        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags, RESET_UBOXFIX_FLAGS)
-    }
 
     if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
@@ -665,7 +466,7 @@ int perfmon_readCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
     return 0;
 }
 
-int perfmon_finalizeCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+int perfmon_finalizeCountersThread_goldmont(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
     int haveTileLock = 0;
@@ -731,16 +532,16 @@ int perfmon_finalizeCountersThread_skylake(int thread_id, PerfmonEventSet* event
             {
                 CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
             }
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            if (counter_map[index].counterRegister2 != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
+            }
         }
         eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
-    if (haveLock && eventSet->regTypeMask & ~(0xFULL))
-    {
-        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values_UBOXFIX, CLEAR_UBOXFIX_OVF)
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, ovf_values_UBOXFIX));
-        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UBOXFIX_CTRL)
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, 0x0ULL));
-    }
 
     if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
diff --git a/src/includes/perfmon_goldmont_counters.h b/src/includes/perfmon_goldmont_counters.h
new file mode 100644
index 0000000..f996cbc
--- /dev/null
+++ b/src/includes/perfmon_goldmont_counters.h
@@ -0,0 +1,65 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_goldmont_counters.h
+ *
+ *      Description:  Counter Header File of perfmon module for Intel Goldmont.
+ *
+ *      Version:   4.1
+ *      Released:  8.8.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_GOLDMONT 12
+#define NUM_COUNTERS_CORE_GOLDMONT 8
+#define NUM_COUNTERS_UNCORE_GOLDMONT 12
+
+#define GLM_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define GLM_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap goldmont_counter_map[NUM_COUNTERS_GOLDMONT] = {
+    /* Fixed Counters: instructions retired, cycles unhalted core, cycles unhalted ref */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, GLM_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, GLM_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, GLM_VALID_OPTIONS_FIXED},
+    /* PMC Counters: 4 counters, 48 bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, GLM_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, GLM_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, GLM_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, GLM_VALID_OPTIONS_PMC},
+    /* Temperature Sensor*/
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL counters */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+};
+
+
+static BoxMap goldmont_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_V4_PERF_GLOBAL_STATUS, MSR_V4_PERF_GLOBAL_STATUS_RESET, 0, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_V4_PERF_GLOBAL_STATUS, MSR_V4_PERF_GLOBAL_STATUS_RESET, 0, 0, 0, 48},
+    [POWER] = {0, 0, 0, 0, 0, 0, 32},
+};
diff --git a/src/includes/perfmon_goldmont_events.txt b/src/includes/perfmon_goldmont_events.txt
new file mode 100644
index 0000000..08218c3
--- /dev/null
+++ b/src/includes/perfmon_goldmont_events.txt
@@ -0,0 +1,211 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_goldmont_events.txt
+#
+#      Description:  Event list for Intel Goldmont
+#
+#      Version:   4.1
+#      Released:  8.8.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY          0x00   PWR3
+UMASK_PWR_DRAM_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_LD_BLOCKS                         0x03 PMC
+UMASK_LD_BLOCKS_DATA_UNKNOWN            0x01
+UMASK_LD_BLOCKS_STORE_FORWARD           0x02
+UMASK_LD_BLOCKS_4K_ALIAS                0x04
+UMASK_LD_BLOCKS_UTLB_MISS               0x08
+UMASK_LD_BLOCKS_ALL_BLOCK               0x10
+
+EVENT_PAGE_WALKS                    0x05 PMC
+UMASK_PAGE_WALKS_D_SIDE_CYCLES      0x01
+DEFAULT_OPTIONS_PAGE_WALKS_D_SIDE_COUNT EVENT_OPTION_EDGE=1
+UMASK_PAGE_WALKS_D_SIDE_COUNT       0x01
+UMASK_PAGE_WALKS_I_SIDE_CYCLES      0x02
+DEFAULT_OPTIONS_PAGE_WALKS_I_SIDE_COUNT EVENT_OPTION_EDGE=1
+UMASK_PAGE_WALKS_I_SIDE_COUNT       0x02
+UMASK_PAGE_WALKS_CYCLES             0x03
+DEFAULT_OPTIONS_PAGE_WALKS_COUNT EVENT_OPTION_EDGE=1
+UMASK_PAGE_WALKS_COUNT              0x03
+
+EVENT_UOPS_ISSUED                0x0E  PMC
+UMASK_UOPS_ISSUED_ANY            0x00
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES    0x00
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES   0x00
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES   0x00
+
+EVENT_MISALIGN_MEM_REF                  0x13 PMC
+UMASK_MISALIGN_MEM_REF_LOAD_PAGE_SPLIT  0x02
+UMASK_MISALIGN_MEM_REF_STORE_PAGE_SPLIT 0x04
+
+EVENT_LONGEST_LAT_CACHE                 0x2E PMC
+UMASK_LONGEST_LAT_CACHE_MISS            0x41
+UMASK_LONGEST_LAT_CACHE_REFERENCE       0x4F
+
+EVENT_L2_REJECT_XQ                      0x30 PMC
+UMASK_L2_REJECT_XQ_ALL                  0x00
+
+EVENT_CORE_REJECT_L2Q                   0x31 PMC
+UMASK_CORE_REJECT_L2Q_ALL               0x00
+
+EVENT_CPU_CLOCK_UNHALTED                0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P       0x00
+UMASK_CPU_CLOCK_UNHALTED_REF            0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES   0x00
+
+EVENT_DL1_DIRTY_EVICTION                0x51 PMC
+UMASK_DL1_DIRTY_EVICTION                0x01
+
+EVENT_ICACHE                            0x80 PMC
+UMASK_ICACHE_HIT                        0x01
+UMASK_ICACHE_MISSES                     0x02
+UMASK_ICACHE_ACCESSES                   0x03
+
+EVENT_ITLB_MISS                         0x81 PMC
+UMASK_ITLB_MISS                         0x04
+
+EVENT_FETCH_STALL                               0x86 PMC
+UMASK_FETCH_STALL_ICACHE_FILL_PENDING_CYCLES    0x02
+DEFAULT_OPTIONS_FETCH_STALL_ICACHE_FILL_PENDING_COUNT EVENT_OPTION_EDGE=1
+UMASK_FETCH_STALL_ICACHE_FILL_PENDING_COUNT     0x02
+
+EVENT_UOPS_NOT_DELIVERED_ANY            0x9C PMC
+UMASK_UOPS_NOT_DELIVERED_ANY            0x00
+
+EVENT_INST_RETIRED                      0xC0 PMC
+UMASK_INST_RETIRED_ANY                  0x00
+UMASK_INST_RETIRED_MS                   0x01
+
+EVENT_MACHINE_CLEARS                    0xC3 PMC
+UMASK_MACHINE_CLEARS_ALL                0x00
+UMASK_MACHINE_CLEARS_SMC                0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_FP_ASSIST          0x04
+UMASK_MACHINE_CLEARS_DISAMBIGUATION     0x08
+
+EVENT_BR_INST_RETIRED                   0xC4 PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES      0x00
+UMASK_BR_INST_RETIRED_JCC               0x7E
+UMASK_BR_INST_RETIRED_TAKEN_JCC         0xFE
+UMASK_BR_INST_RETIRED_CALL              0xF9
+UMASK_BR_INST_RETIRED_REL_CALL          0xFD
+UMASK_BR_INST_RETIRED_IND_CALL          0xFB
+UMASK_BR_INST_RETIRED_RETURN            0xF7
+UMASK_BR_INST_RETIRED_NON_RETURN_IND    0xEB
+UMASK_BR_INST_RETIRED_FAR_BRANCH        0xBF
+
+EVENT_BR_MISP_RETIRED                   0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES      0x00
+UMASK_BR_MISP_RETIRED_JCC               0x7E
+UMASK_BR_MISP_RETIRED_TAKEN_JCC         0xFE
+UMASK_BR_MISP_RETIRED_IND_CALL          0xFB
+UMASK_BR_MISP_RETIRED_RETURN            0xF7
+UMASK_BR_MISP_RETIRED_NON_RETURN_IND    0xEB
+
+EVENT_ISSUE_SLOTS_NOT_CONSUMED                  0xCA PMC
+UMASK_ISSUE_SLOTS_NOT_CONSUMED_ANY              0x00
+UMASK_ISSUE_SLOTS_NOT_CONSUMED_RESOURCE_FULL    0x01
+UMASK_ISSUE_SLOTS_NOT_CONSUMED_RECOVERY         0x02
+
+EVENT_HW_INTERRUPTS                     0xCB PMC
+UMASK_HW_INTERRUPTS_RECEIVED            0x01
+UMASK_HW_INTERRUPTS_PENDING_AND_MASKED  0x04
+DEFAULT_OPTIONS_HW_INTERRUPTS_PENDING_AND_MASKED_COUNT EVENT_OPTION_EDGE=1
+UMASK_HW_INTERRUPTS_PENDING_AND_MASKED_COUNT  0x04
+
+EVENT_CYCLES_DIV_BUSY                   0xCD PMC
+UMASK_CYCLES_DIV_BUSY_ALL               0x00
+UMASK_CYCLES_DIV_BUSY_IDIV              0x01
+UMASK_CYCLES_DIV_BUSY_FPDIV             0x02
+DEFAULT_OPTIONS_CYCLES_DIV_BUSY_ALL_COUNT EVENT_OPTION_EDGE=1
+UMASK_CYCLES_DIV_BUSY_ALL_COUNT         0x00
+DEFAULT_OPTIONS_CYCLES_DIV_BUSY_IDIV_COUNT EVENT_OPTION_EDGE=1
+UMASK_CYCLES_DIV_BUSY_IDIV_COUNT        0x01
+DEFAULT_OPTIONS_CYCLES_DIV_BUSY_FPDIV_COUNT EVENT_OPTION_EDGE=1
+UMASK_CYCLES_DIV_BUSY_FPDIV_COUNT       0x02
+
+EVENT_MEM_UOPS_RETIRED                  0xD0 PMC
+UMASK_MEM_UOPS_RETIRED_ALL_LOADS        0x81
+UMASK_MEM_UOPS_RETIRED_ALL_STORES       0x82
+UMASK_MEM_UOPS_RETIRED_ALL              0x83
+UMASK_MEM_UOPS_RETIRED_DTLB_MISS_LOADS  0x11
+UMASK_MEM_UOPS_RETIRED_DTLB_MISS_STORES 0x12
+UMASK_MEM_UOPS_RETIRED_DTLB_MISS        0x13
+UMASK_MEM_UOPS_RETIRED_LOCK_LOADS       0x21
+UMASK_MEM_UOPS_RETIRED_SPLIT_LOADS      0x41
+UMASK_MEM_UOPS_RETIRED_SPLIT_STORES     0x42
+UMASK_MEM_UOPS_RETIRED_SPLIT_ALL        0x43
+
+EVENT_MEM_LOAD_UOPS_RETIRED             0xD1 PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT      0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS     0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT      0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS     0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_HITM        0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_WCB_HIT     0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_DRAM_HIT    0x80
+
+EVENT_BACLEARS                      0xE6 PMC
+UMASK_BACLEARS_ALL                  0x01
+UMASK_BACLEARS_RETURN               0x08
+UMASK_BACLEARS_COND                 0x10
+
+EVENT_MS_DECODED_MS_ENTRY           0xE7 PMC
+UMASK_MS_DECODED_MS_ENTRY           0x01
+
+EVENT_DECODE_RESTRICTION                 0xE9 PMC
+UMASK_DECODE_RESTRICTION_PREDECODE_WRONG 0x01
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+
diff --git a/src/includes/perfmon_haswell.h b/src/includes/perfmon_haswell.h
index 23d1b64..b364155 100644
--- a/src/includes/perfmon_haswell.h
+++ b/src/includes/perfmon_haswell.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module for Intel Haswell.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -1164,6 +1164,14 @@ int perfmon_setupCounterThread_haswell(
                 break;
         }
     }
+    for (int i=UNCORE;i<NUM_UNITS;i++)
+    {
+        if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+        {
+            VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+            HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+        }
+    }
     if (fixed_flags > 0x0ULL)
     {
         // Erratum HSW143
@@ -1255,7 +1263,7 @@ int perfmon_startCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet
     }
 
     HASEP_UNFREEZE_UNCORE_AND_RESET_CTR;
-    
+
     if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
         VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST (1ULL<<63)|(1ULL<<62)|flags, CLEAR_PMC_AND_FIXED_OVERFLOW)
@@ -1565,23 +1573,23 @@ int perfmon_stopCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
                                 counter_result = 0;
                                 break;
                         }
-                        
+
                     }
                     else if ((eventSet->events[i].event.eventId == 0x01) ||
                              (eventSet->events[i].event.eventId == 0x02))
                     {
                         HPMread(cpu_id, dev, counter1, &counter_result);
-                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_QBOXFIX);
                         counter_result = field64(counter_result, 0, box_map[type].regWidth);
                     }
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_QBOXFIX);
                     eventSet->events[i].threadCounter[thread_id].counterData = counter_result;
+                    eventSet->events[i].threadCounter[thread_id].startData = 0;
                     break;
 
                 default:
                     break;
             }
         }
-        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
 
 
@@ -1652,8 +1660,7 @@ int perfmon_readCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
                         VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_POWER)
                         if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
                         {
-                            VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST eventSet->events[i].threadCounter[thread_id].startData, OVERFLOW_POWER_START)
-                            VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, OVERFLOW_POWER_STOP)
+                            VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, OVERFLOW_POWER)
                             eventSet->events[i].threadCounter[thread_id].overflows++;
                         }
                         *current = field64(counter_result, 0, box_map[type].regWidth);
@@ -1782,7 +1789,7 @@ int perfmon_readCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
                                 counter_result = 0;
                                 break;
                         }
-                        
+
                     }
                     else if ((eventSet->events[i].event.eventId == 0x01) ||
                              (eventSet->events[i].event.eventId == 0x02))
@@ -1876,15 +1883,12 @@ int perfmon_finalizeCountersThread_haswell(int thread_id, PerfmonEventSet* event
             {
                 CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
             }
-            if (box_map[type].filterRegister1)
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            if (counter_map[index].counterRegister2 != 0x0)
             {
-                VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].filterRegister1, 0x0ULL, CLEAR_FILTER);
-                HPMwrite(cpu_id, dev, box_map[type].filterRegister1, 0x0ULL);
-            }
-            if (box_map[type].filterRegister2)
-            {
-                VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].filterRegister2, 0x0ULL, CLEAR_FILTER);
-                HPMwrite(cpu_id, dev, box_map[type].filterRegister2, 0x0ULL);
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
             }
         }
         eventSet->events[i].threadCounter[thread_id].init = FALSE;
@@ -1895,6 +1899,24 @@ int perfmon_finalizeCountersThread_haswell(int thread_id, PerfmonEventSet* event
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, ovf_values_uncore));
         VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, 0x0ULL));
+        for (int i=UNCORE;i<NUM_UNITS;i++)
+        {
+            if ((eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+                HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+                if (box_map[i].filterRegister1)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL);
+                }
+                if (box_map[i].filterRegister2)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL);
+                }
+            }
+        }
     }
 
     if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
diff --git a/src/includes/perfmon_haswellEP_counters.h b/src/includes/perfmon_haswellEP_counters.h
index 0c93c91..af4d524 100644
--- a/src/includes/perfmon_haswellEP_counters.h
+++ b/src/includes/perfmon_haswellEP_counters.h
@@ -6,7 +6,7 @@
  *      Description:  Counter Header File of perfmon module for Intel Haswell EP/EN/EX.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -300,8 +300,8 @@ static BoxMap haswellEP_box_map[NUM_UNITS] = {
     [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 28, 1, PCI_R3QPI_DEVICE_LINK_1, 44},
     [QBOX0] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 25, 1, PCI_QPI_DEVICE_PORT_0, 48},
     [QBOX1] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 26, 1, PCI_QPI_DEVICE_PORT_1, 48},
-    [QBOX0FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32},
-    [QBOX1FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+    [QBOX0FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_0, 64},
+    [QBOX1FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_1, 64},
 };
 
 static PciDevice haswellEP_pci_devices[MAX_NUM_PCI_DEVICES] = {
diff --git a/src/includes/perfmon_haswellEP_events.txt b/src/includes/perfmon_haswellEP_events.txt
index fb078a1..5ea1ad1 100644
--- a/src/includes/perfmon_haswellEP_events.txt
+++ b/src/includes/perfmon_haswellEP_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Haswell EP/EN/EX
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_haswell_counters.h b/src/includes/perfmon_haswell_counters.h
index 4964994..8685420 100644
--- a/src/includes/perfmon_haswell_counters.h
+++ b/src/includes/perfmon_haswell_counters.h
@@ -6,7 +6,7 @@
  *      Description:  Counter Header File of perfmon module for Intel Haswell.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_haswell_events.txt b/src/includes/perfmon_haswell_events.txt
index bc5a37d..8ada1d0 100644
--- a/src/includes/perfmon_haswell_events.txt
+++ b/src/includes/perfmon_haswell_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Haswell
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_interlagos.h b/src/includes/perfmon_interlagos.h
index b922ce2..464a1af 100644
--- a/src/includes/perfmon_interlagos.h
+++ b/src/includes/perfmon_interlagos.h
@@ -6,7 +6,7 @@
  *      Description:  Header file of perfmon module for AMD Interlagos
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -295,6 +295,8 @@ int perfmon_finalizeCountersThread_interlagos(int thread_id, PerfmonEventSet* ev
         {
             VERBOSEPRINTREG(cpu_id, reg, LLU_CAST 0x0ULL, CLEAR_CTRL);
             CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+            VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister, 0x0ULL));
         }
         eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
diff --git a/src/includes/perfmon_interlagos_counters.h b/src/includes/perfmon_interlagos_counters.h
index 5f7ac2f..c3e1702 100644
--- a/src/includes/perfmon_interlagos_counters.h
+++ b/src/includes/perfmon_interlagos_counters.h
@@ -6,7 +6,7 @@
  *      Description:  Counter Header File of perfmon module for AMD Interlagos
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_interlagos_events.txt b/src/includes/perfmon_interlagos_events.txt
index 3a79497..0847b2f 100644
--- a/src/includes/perfmon_interlagos_events.txt
+++ b/src/includes/perfmon_interlagos_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for AMD Interlagos
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_ivybridge.h b/src/includes/perfmon_ivybridge.h
index 19e03d9..e0c9616 100644
--- a/src/includes/perfmon_ivybridge.h
+++ b/src/includes/perfmon_ivybridge.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module for Intel Ivy Bridge.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -48,6 +48,7 @@ static int perfmon_numArchEventsIvybridge = NUM_ARCH_EVENTS_IVYBRIDGE;
 
 int ivb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
 int ivbep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int ivy_cbox_nosetup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
 int (*ivy_cbox_setup)(int, RegisterIndex, PerfmonEvent*);
 
 int perfmon_init_ivybridge(int cpu_id)
@@ -69,6 +70,10 @@ int perfmon_init_ivybridge(int cpu_id)
     {
         ivy_cbox_setup = ivb_cbox_setup;
     }
+    else
+    {
+        ivy_cbox_setup = ivy_cbox_nosetup;
+    }
     return 0;
 }
 
@@ -388,6 +393,12 @@ int ivb_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDevi
     return 0;
 }
 
+int ivy_cbox_nosetup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    return 0;
+}
+
+
 int ivb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
     uint32_t flags = 0x0UL;
@@ -662,7 +673,7 @@ int ivb_ibox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 }
 
 
-int ivb_uncore_freeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+int ivb_uncore_freeze(int cpu_id, PerfmonEventSet* eventSet)
 {
     uint32_t freeze_reg = (cpuid_info.model == IVYBRIDGE_EP ? MSR_UNC_U_PMON_GLOBAL_CTL : MSR_UNC_PERF_GLOBAL_CTRL);
     if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
@@ -674,31 +685,10 @@ int ivb_uncore_freeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
         VERBOSEPRINTREG(cpu_id, freeze_reg, LLU_CAST (1ULL<<31), FREEZE_UNCORE);
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, freeze_reg, (1ULL<<31)));
     }
-    if ((flags != FREEZE_FLAG_ONLYFREEZE) && (eventSet->regTypeMask & ~(0xF)))
-    {
-        for (int j=UNCORE; j<NUM_UNITS; j++)
-        {
-            if (eventSet->regTypeMask & REG_TYPE_MASK(j))
-            {
-                if ((box_map[j].ctrlRegister != 0x0) && (box_map[j].isPci))
-                {
-                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
-                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[j].device,
-                                                    box_map[j].ctrlRegister, flags));
-                }
-                else if (box_map[j].ctrlRegister != 0x0)
-                {
-                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
-                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
-                                                     box_map[j].ctrlRegister, flags));
-                }
-            }
-        }
-    }
     return 0;
 }
 
-int ivb_uncore_unfreeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+int ivb_uncore_unfreeze(int cpu_id, PerfmonEventSet* eventSet)
 {
     uint32_t unfreeze_reg = (cpuid_info.model == IVYBRIDGE_EP ? MSR_UNC_U_PMON_GLOBAL_CTL : MSR_UNC_PERF_GLOBAL_CTRL);
     uint32_t ovf_reg = (cpuid_info.model == IVYBRIDGE_EP ? MSR_UNC_U_PMON_GLOBAL_STATUS : MSR_UNC_PERF_GLOBAL_OVF_CTRL);
@@ -706,27 +696,6 @@ int ivb_uncore_unfreeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
     {
         return 0;
     }
-    if ((flags != FREEZE_FLAG_ONLYFREEZE) && (eventSet->regTypeMask & ~(0xF)))
-    {
-        for (int j=UNCORE; j<NUM_UNITS; j++)
-        {
-            if (eventSet->regTypeMask & REG_TYPE_MASK(j))
-            {
-                if ((box_map[j].ctrlRegister != 0x0) && (box_map[j].isPci))
-                {
-                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
-                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[j].device,
-                                                    box_map[j].ctrlRegister, flags));
-                }
-                else if (box_map[j].ctrlRegister != 0x0)
-                {
-                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
-                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
-                                                     box_map[j].ctrlRegister, flags));
-                }
-            }
-        }
-    }
     if (eventSet->regTypeMask & ~(0xF))
     {
         VERBOSEPRINTREG(cpu_id, ovf_reg, LLU_CAST 0x0ULL, CLEAR_UNCORE_OVF)
@@ -757,7 +726,7 @@ int perfmon_setupCounterThread_ivybridge(
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
     }
 
-    ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
+    ivb_uncore_freeze(cpu_id, eventSet);
 
     for (int i=0;i < eventSet->numberOfEvents;i++)
     {
@@ -769,7 +738,7 @@ int perfmon_setupCounterThread_ivybridge(
         RegisterIndex index = eventSet->events[i].index;
         PerfmonEvent *event = &(eventSet->events[i].event);
         eventSet->events[i].threadCounter[thread_id].init = TRUE;
-        switch (eventSet->events[i].type)
+        switch (type)
         {
             case PMC:
                 ivb_pmc_setup(cpu_id, index, event);
@@ -860,6 +829,14 @@ int perfmon_setupCounterThread_ivybridge(
                 break;
         }
     }
+    for (int i=UNCORE;i<NUM_UNITS;i++)
+    {
+        if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+        {
+            VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+            HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+        }
+    }
     if (fixed_flags > 0x0)
     {
         VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
@@ -893,6 +870,7 @@ int perfmon_startCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventS
             tmp = 0x0ULL;
             RegisterIndex index = eventSet->events[i].index;
             uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t counter2 = counter_map[index].counterRegister2;
             eventSet->events[i].threadCounter[thread_id].startData = 0;
             eventSet->events[i].threadCounter[thread_id].counterData = 0;
             switch (type)
@@ -923,13 +901,22 @@ int perfmon_startCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventS
                     break;
 
                 default:
+                    if (eventSet->regTypeMask & REG_TYPE_MASK(type))
+                    {
+                        if (counter1 != 0x0)
+                        {
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, box_map[type].device, counter1, 0x0ULL));
+                            if (counter2 != 0x0)
+                                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, box_map[type].device, counter2, 0x0ULL));
+                        }
+                    }
                     break;
             }
             eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
         }
     }
 
-    ivb_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTR);
+    ivb_uncore_unfreeze(cpu_id, eventSet);
 
     if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
@@ -1065,7 +1052,7 @@ int perfmon_stopCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSe
         VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
     }
-    ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTL);
+    ivb_uncore_freeze(cpu_id, eventSet);
 
     for (int i=0;i < eventSet->numberOfEvents;i++)
     {
@@ -1160,6 +1147,7 @@ int perfmon_stopCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSe
                                 break;
                         }
                         VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED_REAL)
+                        eventSet->events[i].threadCounter[thread_id].startData = 0;
                     }
                     break;
 
@@ -1234,7 +1222,6 @@ int perfmon_stopCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSe
             }
             *current = field64(counter_result, 0, box_map[type].regWidth);
         }
-        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
 
     return 0;
@@ -1257,7 +1244,7 @@ int perfmon_readCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSe
         CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags));
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
     }
-    ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
+    ivb_uncore_freeze(cpu_id, eventSet);
 
     for (int i=0;i < eventSet->numberOfEvents;i++)
     {
@@ -1327,29 +1314,36 @@ int perfmon_readCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSe
                     {
                         CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
                         VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED)
-                        switch (extractBitField(counter_result,3,0))
+                        if (eventSet->events[i].event.eventId == 0x00)
                         {
-                            case 0x2:
-                                counter_result = 5600000000ULL;
-                                break;
-                            case 0x3:
-                                counter_result = 6400000000ULL;
-                                break;
-                            case 0x4:
-                                counter_result = 7200000000ULL;
-                                break;
-                            case 0x5:
-                                counter_result = 8000000000ULL;
-                                break;
-                            case 0x6:
-                                counter_result = 8800000000ULL;
-                                break;
-                            case 0x7:
-                                counter_result = 9600000000ULL;
-                                break;
-                            default:
-                                counter_result = 0x0ULL;
-                                break;
+                            switch (extractBitField(counter_result,3,0))
+                            {
+                                case 0x2:
+                                    counter_result = 5600000000ULL;
+                                    break;
+                                case 0x3:
+                                    counter_result = 6400000000ULL;
+                                    break;
+                                case 0x4:
+                                    counter_result = 7200000000ULL;
+                                    break;
+                                case 0x5:
+                                    counter_result = 8000000000ULL;
+                                    break;
+                                case 0x6:
+                                    counter_result = 8800000000ULL;
+                                    break;
+                                case 0x7:
+                                    counter_result = 9600000000ULL;
+                                    break;
+                                default:
+                                    counter_result = 0x0ULL;
+                                    break;
+                            }
+                        }
+                        else if (eventSet->events[i].event.eventId == 0x01)
+                        {
+                            counter_result = extractBitField(counter_result,1,4);
                         }
                         VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED_REAL)
                         eventSet->events[i].threadCounter[thread_id].startData = 0;
@@ -1430,7 +1424,7 @@ int perfmon_readCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSe
         }
     }
 
-    ivb_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
+    ivb_uncore_unfreeze(cpu_id, eventSet);
     if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags));
@@ -1457,47 +1451,65 @@ int perfmon_finalizeCountersThread_ivybridge(int thread_id, PerfmonEventSet* eve
 
     for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
         {
-            RegisterType type = eventSet->events[i].type;
-            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
-            {
-                continue;
-            }
-            RegisterIndex index = eventSet->events[i].index;
-            PciDeviceIndex dev = counter_map[index].device;
-            uint64_t reg = counter_map[index].configRegister;
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PciDeviceIndex dev = counter_map[index].device;
+        uint64_t reg = counter_map[index].configRegister;
 
-            switch(type)
+        switch(type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32));
+                break;
+            default:
+                break;
+        }
+        if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            if (type >= SBOX0 && type <= SBOX2)
             {
-                case PMC:
-                    ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
-                    if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
-                    {
-                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
-                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
-                    }
-                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
-                    {
-                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
-                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
-                    }
-                    break;
-                case FIXED:
-                    ovf_values_core |= (1ULL<<(index+32));
-                    break;
-                default:
-                    break;
+                VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL_TWICE);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
             }
-            if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            if (type >= SBOX0 && type <= SBOX2)
             {
-                VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
-                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
-                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR_TWICE);
                 CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
             }
-            eventSet->events[i].threadCounter[thread_id].init = FALSE;
+            if (counter_map[index].counterRegister2 != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
+                if (type >= SBOX0 && type <= SBOX2)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR_TWICE);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
+                }
+            }
+            
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
     if (haveLock && eventSet->regTypeMask & ~(0xFULL))
     {
@@ -1505,6 +1517,26 @@ int perfmon_finalizeCountersThread_ivybridge(int thread_id, PerfmonEventSet* eve
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_STATUS, 0x0ULL));
         VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_CTL, 0x0ULL));
+        for (int i=UNCORE;i<NUM_UNITS;i++)
+        {
+            if ((eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+                HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+                if (i >= SBOX0 && i <= SBOX2)
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+                if (box_map[i].filterRegister1)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL);
+                }
+                if (box_map[i].filterRegister2)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL);
+                }
+            }
+        }
     }
 
     if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
diff --git a/src/includes/perfmon_ivybridgeEP_counters.h b/src/includes/perfmon_ivybridgeEP_counters.h
index 896530c..dc32f9e 100644
--- a/src/includes/perfmon_ivybridgeEP_counters.h
+++ b/src/includes/perfmon_ivybridgeEP_counters.h
@@ -6,7 +6,7 @@
  *      Description: Counter header file of perfmon module for Intel Ivy Bridge EP.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_ivybridgeEP_events.txt b/src/includes/perfmon_ivybridgeEP_events.txt
index e71e1cf..fe77350 100644
--- a/src/includes/perfmon_ivybridgeEP_events.txt
+++ b/src/includes/perfmon_ivybridgeEP_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Ivy Bridge EP/EN/EX
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -447,6 +447,9 @@ UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT          0x02
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM         0x04
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE         0x08
 
+EVENT_MEM_LOAD_UOPS_MISC_RETIRED               0xD4   PMC
+UMASK_MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS      0x02
+
 EVENT_BACLEARS               0xE6   PMC
 UMASK_BACLEARS_ANY           0x1F
 
@@ -996,6 +999,9 @@ UMASK_WR_CAS_RANK7_BANK7           0x80
 EVENT_QPI_RATE                     0x00    SBOX0FIX|SBOX1FIX|SBOX2FIX
 UMASK_QPI_RATE                     0x00
 
+EVENT_QPI_SLOW_MODE                0x01    SBOX0FIX|SBOX1FIX|SBOX2FIX
+UMASK_QPI_SLOW_MODE                0x00
+
 EVENT_SBOX_CLOCKTICKS               0x14 SBOX0|SBOX1|SBOX2
 UMASK_SBOX_CLOCKTICKS               0x00
 
diff --git a/src/includes/perfmon_ivybridge_counters.h b/src/includes/perfmon_ivybridge_counters.h
index 742b230..d28f86a 100644
--- a/src/includes/perfmon_ivybridge_counters.h
+++ b/src/includes/perfmon_ivybridge_counters.h
@@ -6,7 +6,7 @@
  *      Description: Counter header file of perfmon module for Intel Ivy Bridge.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_ivybridge_events.txt b/src/includes/perfmon_ivybridge_events.txt
index 1ff619a..99a5011 100644
--- a/src/includes/perfmon_ivybridge_events.txt
+++ b/src/includes/perfmon_ivybridge_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Ivy Bridge
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -450,6 +450,9 @@ UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT          0x02
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM         0x04
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE         0x08
 
+EVENT_MEM_LOAD_UOPS_MISC_RETIRED               0xD4   PMC
+UMASK_MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS      0x02
+
 EVENT_BACLEARS               0xE6   PMC
 UMASK_BACLEARS_ANY           0x1F
 
diff --git a/src/includes/perfmon_k10.h b/src/includes/perfmon_k10.h
index 2a7bc59..bd4e32a 100644
--- a/src/includes/perfmon_k10.h
+++ b/src/includes/perfmon_k10.h
@@ -6,7 +6,7 @@
  *      Description:  Header file of perfmon module for AMD K10
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -219,6 +219,8 @@ int perfmon_finalizeCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
         {
             VERBOSEPRINTREG(cpu_id, reg, 0x0ULL, CLEAR_CTRL);
             CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+            VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister, 0x0ULL));
         }
         eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
diff --git a/src/includes/perfmon_k10_counters.h b/src/includes/perfmon_k10_counters.h
index e94e29a..b1a794f 100644
--- a/src/includes/perfmon_k10_counters.h
+++ b/src/includes/perfmon_k10_counters.h
@@ -6,7 +6,7 @@
  *      Description:  AMD K10 performance counter definition. Also used for AMD K8.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_k10_events.txt b/src/includes/perfmon_k10_events.txt
index d45d790..ab56f1d 100644
--- a/src/includes/perfmon_k10_events.txt
+++ b/src/includes/perfmon_k10_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for AMD K10
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_k8.h b/src/includes/perfmon_k8.h
index 513929b..dd55c32 100644
--- a/src/includes/perfmon_k8.h
+++ b/src/includes/perfmon_k8.h
@@ -7,7 +7,7 @@
  *                    The setup routines and registers are similar to AMD K10
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_k8_events.txt b/src/includes/perfmon_k8_events.txt
index 48d0614..d71316a 100644
--- a/src/includes/perfmon_k8_events.txt
+++ b/src/includes/perfmon_k8_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for AMD K8
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_kabini.h b/src/includes/perfmon_kabini.h
index 323e713..dec1436 100644
--- a/src/includes/perfmon_kabini.h
+++ b/src/includes/perfmon_kabini.h
@@ -6,7 +6,7 @@
  *      Description:  Header file of perfmon module for AMD Family 16
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -353,6 +353,8 @@ int perfmon_finalizeCountersThread_kabini(int thread_id, PerfmonEventSet* eventS
         {
             VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, 0x0ULL, CLEAR_CTRL);
             CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, 0x0ULL));
+            VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister, 0x0ULL));
             eventSet->events[i].threadCounter[thread_id].init = FALSE;
         }
     }
diff --git a/src/includes/perfmon_kabini_counters.h b/src/includes/perfmon_kabini_counters.h
index e303341..dd15fbb 100644
--- a/src/includes/perfmon_kabini_counters.h
+++ b/src/includes/perfmon_kabini_counters.h
@@ -6,7 +6,7 @@
  *      Description:  Counter Header File of perfmon module for AMD Family 16
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_kabini_events.txt b/src/includes/perfmon_kabini_events.txt
index a1bac4f..bfa15f2 100644
--- a/src/includes/perfmon_kabini_events.txt
+++ b/src/includes/perfmon_kabini_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for AMD Kabini
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   saravanan.ekanathan at amd.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_nehalem.h b/src/includes/perfmon_nehalem.h
index 6f23bd0..772f9e4 100644
--- a/src/includes/perfmon_nehalem.h
+++ b/src/includes/perfmon_nehalem.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module for Intel Nehalem.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -555,51 +555,55 @@ int perfmon_finalizeCountersThread_nehalem(int thread_id, PerfmonEventSet* event
 
     for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
         {
-            RegisterType type = eventSet->events[i].type;
-            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
-            {
-                continue;
-            }
-            RegisterIndex index = eventSet->events[i].index;
-            uint64_t reg = counter_map[index].configRegister;
-            PciDeviceIndex dev = counter_map[index].device;
-            switch (type)
-            {
-                case PMC:
-                    ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
-                    if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
-                    {
-                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
-                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
-                    }
-                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB) &&
-                             ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
-                    {
-                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
-                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
-                    }
-                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0x35) &&
-                             ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
-                    {
-                        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL, CLEAR_UNCORE_MATCH);
-                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL));
-                    }
-                    break;
-                case FIXED:
-                    ovf_values_core |= (1ULL<<(index+32));
-                    break;
-                default:
-                    break;
-            }
-            if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        uint64_t reg = counter_map[index].configRegister;
+        PciDeviceIndex dev = counter_map[index].device;
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB) &&
+                         ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0x35) &&
+                         ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL, CLEAR_UNCORE_MATCH);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32));
+                break;
+            default:
+                break;
+        }
+        if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            if (counter_map[index].counterRegister2 != 0x0)
             {
-                VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
-                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
             }
-            eventSet->events[i].threadCounter[thread_id].init = FALSE;
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
 
     if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
@@ -616,6 +620,24 @@ int perfmon_finalizeCountersThread_nehalem(int thread_id, PerfmonEventSet* event
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
         VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTRL);
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+        for (int i=UNCORE;i<NUM_UNITS;i++)
+        {
+            if ((eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+                HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+                if (box_map[i].filterRegister1)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL);
+                }
+                if (box_map[i].filterRegister2)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL);
+                }
+            }
+        }
     }
     return 0;
 }
diff --git a/src/includes/perfmon_nehalemEX.h b/src/includes/perfmon_nehalemEX.h
index b093ba9..f50c8ec 100644
--- a/src/includes/perfmon_nehalemEX.h
+++ b/src/includes/perfmon_nehalemEX.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module for Intel Nehalem EX.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -1254,6 +1254,13 @@ int perfmon_finalizeCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eve
         {
             VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
             CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            if (counter_map[index].counterRegister2 != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
+            }
         }
         eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
@@ -1272,6 +1279,24 @@ int perfmon_finalizeCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eve
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL));
         VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTRL);
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+        for (int i=UNCORE;i<NUM_UNITS;i++)
+        {
+            if ((eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+                HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+                if (box_map[i].filterRegister1)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL);
+                }
+                if (box_map[i].filterRegister2)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL);
+                }
+            }
+        }
     }
     return 0;
 }
diff --git a/src/includes/perfmon_nehalemEX_counters.h b/src/includes/perfmon_nehalemEX_counters.h
index d40da5c..137c414 100644
--- a/src/includes/perfmon_nehalemEX_counters.h
+++ b/src/includes/perfmon_nehalemEX_counters.h
@@ -6,7 +6,7 @@
  *      Description: Counter Header File of perfmon module for Intel Westmere EX.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_nehalemEX_events.txt b/src/includes/perfmon_nehalemEX_events.txt
index 1c4cf31..62a3f02 100644
--- a/src/includes/perfmon_nehalemEX_events.txt
+++ b/src/includes/perfmon_nehalemEX_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Nehalem EX
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_nehalemEX_westmereEX_common.h b/src/includes/perfmon_nehalemEX_westmereEX_common.h
index 655d5c0..a2d0ebb 100644
--- a/src/includes/perfmon_nehalemEX_westmereEX_common.h
+++ b/src/includes/perfmon_nehalemEX_westmereEX_common.h
@@ -1,3 +1,33 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_nehalemEX_westmereEX_common.h
+ *
+ *      Description:  Common definitions for Intel Nehalem EX and Westmere EX
+ *
+ *      Version:   4.1
+ *      Released:  8.8.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
 #ifndef PERFMON_NEX_WEX_COMMON
 #define PERFMON_NEX_WEX_COMMON
 
diff --git a/src/includes/perfmon_nehalem_counters.h b/src/includes/perfmon_nehalem_counters.h
index 55d0d88..332b46a 100644
--- a/src/includes/perfmon_nehalem_counters.h
+++ b/src/includes/perfmon_nehalem_counters.h
@@ -6,7 +6,7 @@
  *      Description:  Counter Header File of perfmon module for Intel Nehalem.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_nehalem_events.txt b/src/includes/perfmon_nehalem_events.txt
index a17b55e..48c9b41 100644
--- a/src/includes/perfmon_nehalem_events.txt
+++ b/src/includes/perfmon_nehalem_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Nehalem
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_p6_events.txt b/src/includes/perfmon_p6_events.txt
index 9ad1cbc..e8cdda9 100644
--- a/src/includes/perfmon_p6_events.txt
+++ b/src/includes/perfmon_p6_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Pentium 3
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_perf.h b/src/includes/perfmon_perf.h
index 8927d51..a21aaad 100644
--- a/src/includes/perfmon_perf.h
+++ b/src/includes/perfmon_perf.h
@@ -1,13 +1,13 @@
 /*
  * =======================================================================================
  *
- *      Filename:  perfmon_ivybridgeEP_counters.h
+ *      Filename:  perfmon_perf.h
  *
  *      Description: Header file of example perfmon module for software events using
  *                   the perf_event interface
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_phi.h b/src/includes/perfmon_phi.h
index ecf31bb..9fde8cf 100644
--- a/src/includes/perfmon_phi.h
+++ b/src/includes/perfmon_phi.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module for Intel Xeon Phi.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -228,6 +228,8 @@ int perfmon_finalizeCountersThread_phi(int thread_id, PerfmonEventSet* eventSet)
         RegisterIndex index = eventSet->events[i].index;
         ovf_values_core |= (1ULL<<(index));
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[i].configRegister, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[i].counterRegister, 0x0ULL));
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
     CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
     CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, 0x0ULL));
diff --git a/src/includes/perfmon_phi_counters.h b/src/includes/perfmon_phi_counters.h
index 5bd8010..43523d4 100644
--- a/src/includes/perfmon_phi_counters.h
+++ b/src/includes/perfmon_phi_counters.h
@@ -6,7 +6,7 @@
  *      Description: Counter Header File of perfmon module for Intel Xeon Phi.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_phi_events.txt b/src/includes/perfmon_phi_events.txt
index 1c5434e..4b280c7 100644
--- a/src/includes/perfmon_phi_events.txt
+++ b/src/includes/perfmon_phi_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Xeon Phi
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_pm.h b/src/includes/perfmon_pm.h
index 73beaf2..5992733 100644
--- a/src/includes/perfmon_pm.h
+++ b/src/includes/perfmon_pm.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module Pentium M.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -224,6 +224,8 @@ int perfmon_finalizeCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
         {
             CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
             VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister, 0x0ULL));
+            VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
         }
         eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
diff --git a/src/includes/perfmon_pm_counters.h b/src/includes/perfmon_pm_counters.h
index 7e0d6da..3e2d6ec 100644
--- a/src/includes/perfmon_pm_counters.h
+++ b/src/includes/perfmon_pm_counters.h
@@ -6,7 +6,7 @@
  *      Description: Counter Header File of perfmon module for Intel Pentium M.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_pm_events.txt b/src/includes/perfmon_pm_events.txt
index 45fd7f4..c4670e0 100644
--- a/src/includes/perfmon_pm_events.txt
+++ b/src/includes/perfmon_pm_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Pentium M
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_sandybridge.h b/src/includes/perfmon_sandybridge.h
index f6f9665..74b017e 100644
--- a/src/includes/perfmon_sandybridge.h
+++ b/src/includes/perfmon_sandybridge.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module for Intel Sandy Bridge.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -45,6 +45,7 @@ static int perfmon_numArchEventsSandybridge = NUM_ARCH_EVENTS_SANDYBRIDGE;
 
 int snb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
 int snbep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int sandy_cbox_nosetup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
 int (*sandy_cbox_setup)(int, RegisterIndex, PerfmonEvent*);
 
 int perfmon_init_sandybridge(int cpu_id)
@@ -66,7 +67,11 @@ int perfmon_init_sandybridge(int cpu_id)
     {
         sandy_cbox_setup = snb_cbox_setup;
     }
-    
+    else
+    {
+        sandy_cbox_setup = sandy_cbox_nosetup;
+    }
+
     return 0;
 }
 
@@ -212,6 +217,11 @@ int snb_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 }
 
 
+int sandy_cbox_nosetup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    return 0;
+}
+
 uint32_t snb_cbox_filter(PerfmonEvent *event)
 {
     int j;
@@ -877,7 +887,7 @@ int perfmon_setupCounterThread_sandybridge(
             case UBOX:
                 snb_ubox_setup(cpu_id, index, event);
                 break;
-                
+
             case UBOXFIX:
                 if (cpuid_info.model == SANDYBRIDGE_EP)
                 {
@@ -924,7 +934,15 @@ int perfmon_setupCounterThread_sandybridge(
                 break;
         }
     }
-    
+    for (int i=UNCORE;i<NUM_UNITS;i++)
+    {
+        if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+        {
+            VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+            HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+        }
+    }
+
     if (fixed_flags > 0x0)
     {
         VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
@@ -1397,6 +1415,7 @@ int perfmon_stopCountersThread_sandybridge(int thread_id, PerfmonEventSet* event
                         {
                             counter_result = extractBitField(counter_result, 1, 4);
                         }
+                        eventSet->events[i].threadCounter[thread_id].startData = 0;
                         VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, STOP_SBOXFIX);
                     }
                     break;
@@ -1699,7 +1718,7 @@ int perfmon_readCountersThread_sandybridge(int thread_id, PerfmonEventSet* event
 
                 case SBOX0FIX:
                 case SBOX1FIX:
-                    
+
                     HPMread(cpu_id, dev, counter1, &counter_result);
                     if (eventSet->events[i].event.eventId == 0x00)
                     {
@@ -1834,48 +1853,73 @@ int perfmon_finalizeCountersThread_sandybridge(int thread_id, PerfmonEventSet* e
 
     for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
         {
-            RegisterType type = eventSet->events[i].type;
-            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
-            {
-                continue;
-            }
-            RegisterIndex index = eventSet->events[i].index;
-            PciDeviceIndex dev = counter_map[index].device;
-            uint64_t reg = counter_map[index].configRegister;
-            switch(type)
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PciDeviceIndex dev = counter_map[index].device;
+        uint64_t reg = counter_map[index].configRegister;
+        switch(type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32));
+                break;
+            default:
+                break;
+        }
+        if ((reg) &&
+            (((type == PMC)||(type == FIXED)) || ((type >= UNCORE) && (haveLock) && (HPMcheck(dev, cpu_id)))))
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            if (counter_map[index].counterRegister2 != 0x0)
             {
-                case PMC:
-                    ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
-                    if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
-                    {
-                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
-                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
-                    }
-                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
-                    {
-                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
-                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
-                    }
-                    break;
-                case FIXED:
-                    ovf_values_core |= (1ULL<<(index+32));
-                    break;
-                default:
-                    break;
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
             }
-            if ((reg) &&
-                (((type == PMC)||(type == FIXED)) || ((type >= UNCORE) && (haveLock) && (HPMcheck(dev, cpu_id)))))
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
+    }
+
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL))
+    {
+        for (int i=UNCORE;i<NUM_UNITS;i++)
+        {
+            if ((eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
             {
-                VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
-                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+                VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+                HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+                if (box_map[i].filterRegister1 != 0x0)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL);
+                }
+                if (box_map[i].filterRegister2 != 0x0)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL);
+                }
             }
-            eventSet->events[i].threadCounter[thread_id].init = FALSE;
         }
     }
 
-
     if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
         VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
diff --git a/src/includes/perfmon_sandybridgeEP_counters.h b/src/includes/perfmon_sandybridgeEP_counters.h
index befef53..5b634ec 100644
--- a/src/includes/perfmon_sandybridgeEP_counters.h
+++ b/src/includes/perfmon_sandybridgeEP_counters.h
@@ -6,7 +6,7 @@
  *      Description: Counter header file of perfmon module for Intel Sandy Bridge EP.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -188,8 +188,8 @@ static BoxMap sandybridgeEP_box_map[NUM_UNITS] = {
     [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, 0, 0, 0, 1, PCI_HA_DEVICE_0, 48},
     [SBOX0] = {PCI_UNC_QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_QPI_DEVICE_PORT_0, 48},
     [SBOX1] = {PCI_UNC_QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_QPI_DEVICE_PORT_1, 48},
-    [SBOX0FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32},
-    [SBOX1FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+    [SBOX0FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 64},
+    [SBOX1FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_1, 64},
     [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R3QPI_DEVICE_LINK_0, 44},
     [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R3QPI_DEVICE_LINK_1, 44},
     [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R2PCIE_DEVICE, 44},
diff --git a/src/includes/perfmon_sandybridgeEP_events.txt b/src/includes/perfmon_sandybridgeEP_events.txt
index 63198a9..1ccccbb 100644
--- a/src/includes/perfmon_sandybridgeEP_events.txt
+++ b/src/includes/perfmon_sandybridgeEP_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel SandyBridge EP
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -255,6 +255,10 @@ UMASK_BR_MISP_EXEC_ALL_BRANCHES                        0xFF
 
 EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
 UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS      0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP    0x01
 
 EVENT_UOPS_DISPATCHED_PORT                  0xA1   PMC
 UMASK_UOPS_DISPATCHED_PORT_PORT_0           0x01
@@ -372,7 +376,7 @@ DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTIO
 UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_ALL              0x01
-DEFAULT_OPTIONS__UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
@@ -382,6 +386,9 @@ DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_
 UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
 
 EVENT_MACHINE_CLEARS                    0xC3  PMC
+UMASK_MACHINE_CLEARS_CYCLES             0x01
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT    EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT              0x01
 UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
 UMASK_MACHINE_CLEARS_SMC                0x04
 UMASK_MACHINE_CLEARS_MASKMOV            0x20
diff --git a/src/includes/perfmon_sandybridge_counters.h b/src/includes/perfmon_sandybridge_counters.h
index e8dca5b..7dd83d4 100644
--- a/src/includes/perfmon_sandybridge_counters.h
+++ b/src/includes/perfmon_sandybridge_counters.h
@@ -6,7 +6,7 @@
  *      Description: Counter header file of perfmon module for Intel Sandy Bridge.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_sandybridge_events.txt b/src/includes/perfmon_sandybridge_events.txt
index 8bab52b..8cb10dd 100644
--- a/src/includes/perfmon_sandybridge_events.txt
+++ b/src/includes/perfmon_sandybridge_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel SandyBridge
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_silvermont.h b/src/includes/perfmon_silvermont.h
index 980d528..3512b99 100644
--- a/src/includes/perfmon_silvermont.h
+++ b/src/includes/perfmon_silvermont.h
@@ -6,7 +6,7 @@
  *      Description:  Header file of perfmon module for Intel Atom (Silvermont)
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
@@ -510,6 +510,13 @@ int perfmon_finalizeCountersThread_silvermont(int thread_id, PerfmonEventSet* ev
         {
             VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
             CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            if (counter_map[index].counterRegister2 != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
+            }
         }
         eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
diff --git a/src/includes/perfmon_silvermont_counters.h b/src/includes/perfmon_silvermont_counters.h
index f04c87b..6428d7f 100644
--- a/src/includes/perfmon_silvermont_counters.h
+++ b/src/includes/perfmon_silvermont_counters.h
@@ -6,7 +6,7 @@
  *      Description: Counter header file of perfmon module for Intel Atom (Silvermont)
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/perfmon_silvermont_events.txt b/src/includes/perfmon_silvermont_events.txt
index 5b2d1a7..a8a9bea 100644
--- a/src/includes/perfmon_silvermont_events.txt
+++ b/src/includes/perfmon_silvermont_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Atom (Silvermont)
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
diff --git a/src/includes/perfmon_skylake.h b/src/includes/perfmon_skylake.h
index 1a10dc4..28363c5 100644
--- a/src/includes/perfmon_skylake.h
+++ b/src/includes/perfmon_skylake.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module for Intel Skylake.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -154,6 +154,47 @@ int skl_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
     return 0;
 }
 
+int skl_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Assemble and write the config register value for the UBOX counter 'index' on 'cpu_id' */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in socket_lock for this CPU's node programs the register */
+    {
+        return 0; /* other CPUs skip the setup and still report success */
+    }
+
+    flags = (1ULL<<22)|(1ULL<<20); /* bits 20 and 22 are always set; NOTE(review): presumably overflow-enable and counter-enable, confirm against the register description */
+    flags |= (event->umask<<8) + event->eventId; /* umask shifted to bit 8 and up, event id added in the low bits */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++) /* translate each supplied event option into its config bit(s) */
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18); /* edge option -> bit 18 */
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23); /* invert option -> bit 23 */
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0x1FULL) << 24; /* threshold value truncated to 5 bits, placed at bits 24-28 */
+                    break;
+                default: /* every other option type is ignored */
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* write only when the value differs from the cached configuration */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_UBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags; /* remember the value just written for the next comparison */
+    }
+    return 0; /* 0 on every path visible here; NOTE(review): CHECK_MSR_WRITE_ERROR may leave early on a failed write, confirm in its definition */
+}
+
 int skl_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
     int j;
@@ -253,13 +294,7 @@ int perfmon_setupCounterThread_skylake(
                 }
                 break;
             case UBOX:
-                if (haveLock)
-                {
-                    uint64_t uflags = 0x0ULL;
-                    uflags |= (1ULL<<20)|(1ULL<<22);
-                    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, uflags, CLEAR_UBOX)
-                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, uflags));
-                }
+                skl_ubox_setup(cpu_id, index, event);
                 break;
             case CBOX0:
             case CBOX1:
@@ -517,7 +552,6 @@ int perfmon_stopCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
                     break;
             }
         }
-        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
     if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
     {
@@ -557,9 +591,9 @@ int perfmon_readCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
     if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
     {
         CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, &uflags));
-        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, LLU_CAST uflags, SAFE_UBOXFIX_FLAGS)
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, LLU_CAST uflags, SAFE_UNCORE_FLAGS)
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
-        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, RESET_UBOXFIX_FLAGS)
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, RESET_UNCORE_FLAGS)
     }
 
     for (int i=0;i < eventSet->numberOfEvents;i++)
@@ -637,6 +671,7 @@ int perfmon_readCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
                         CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
                         SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
                         *current = field64(counter_result, 0, box_map[type].regWidth);
+                        uflags |= (1ULL<<(type-CBOX0));
                     }
                     break;
 
@@ -652,8 +687,8 @@ int perfmon_readCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
         {
             CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, counter_result));
         }
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags));
-        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags, RESET_UBOXFIX_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags|(1ULL<<29)));
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags|(1ULL<<29), RESTORE_UNCORE_FLAGS)
     }
 
     if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
@@ -711,13 +746,6 @@ int perfmon_finalizeCountersThread_skylake(int thread_id, PerfmonEventSet* event
                 ovf_values_core |= (1ULL<<(index+32));
                 break;
             default:
-                /*if (counter_map[index].type > UBOXFIX)
-                {
-                    if (box_map[counter_map[index].type].ovflOffset >= 0)
-                    {
-                        ovf_values_UBOXFIX |= (1ULL<<box_map[counter_map[index].type].ovflOffset);
-                    }
-                }*/
                 break;
         }
         if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UBOXFIX) && (haveLock))))
@@ -731,15 +759,40 @@ int perfmon_finalizeCountersThread_skylake(int thread_id, PerfmonEventSet* event
             {
                 CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
             }
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            if (counter_map[index].counterRegister2 != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
+            }
         }
         eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
     if (haveLock && eventSet->regTypeMask & ~(0xFULL))
     {
-        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values_UBOXFIX, CLEAR_UBOXFIX_OVF)
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, ovf_values_UBOXFIX));
-        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UBOXFIX_CTRL)
-        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_STATUS, LLU_CAST 0x0ULL, CLEAR_UNCORE_STATUS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
+        for (int i=UNCORE;i<NUM_UNITS;i++)
+        {
+            if ((eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+                HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+                if (box_map[i].filterRegister1)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL);
+                }
+                if (box_map[i].filterRegister2)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL);
+                }
+            }
+        }
     }
 
     if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
diff --git a/src/includes/perfmon_skylake_counters.h b/src/includes/perfmon_skylake_counters.h
index 9b0e2c7..2c2630d 100644
--- a/src/includes/perfmon_skylake_counters.h
+++ b/src/includes/perfmon_skylake_counters.h
@@ -6,7 +6,7 @@
  *      Description:  Counter Header File of perfmon module for Intel Skylake.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -36,17 +36,19 @@
 #define SKL_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
 #define SKL_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
             EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define SKL_VALID_OPTIONS_CBOX EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define SKL_VALID_OPTIONS_UBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
 
 static RegisterMap skylake_counter_map[NUM_COUNTERS_SKYLAKE] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, HAS_VALID_OPTIONS_FIXED},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, HAS_VALID_OPTIONS_FIXED},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, HAS_VALID_OPTIONS_FIXED},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, SKL_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, SKL_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, SKL_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, HAS_VALID_OPTIONS_PMC},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, HAS_VALID_OPTIONS_PMC},
-    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, HAS_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
-    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, HAS_VALID_OPTIONS_PMC},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, SKL_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, SKL_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, SKL_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, SKL_VALID_OPTIONS_PMC},
     /* Temperature Sensor*/
     {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
     /* RAPL counters */
@@ -57,16 +59,16 @@ static RegisterMap skylake_counter_map[NUM_COUNTERS_SKYLAKE] = {
     {"PWR4", PMC12, POWER, 0, MSR_PLATFORM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
     /* Test */
     {"UBOXFIX", PMC13, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
-    {"UBOX0", PMC14, UBOX, MSR_V4_ARB_PERF_FIXED_CTRL0, MSR_V4_ARB_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
-    {"UBOX1", PMC15, UBOX, MSR_V4_ARB_PERF_FIXED_CTRL1, MSR_V4_ARB_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
-    {"CBOX0C0", PMC16, CBOX0, MSR_V4_C0_PERF_FIXED_CTRL0, MSR_V4_C0_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
-    {"CBOX0C1", PMC17, CBOX0, MSR_V4_C0_PERF_FIXED_CTRL1, MSR_V4_C0_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
-    {"CBOX1C0", PMC18, CBOX1, MSR_V4_C1_PERF_FIXED_CTRL0, MSR_V4_C1_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
-    {"CBOX1C1", PMC19, CBOX1, MSR_V4_C1_PERF_FIXED_CTRL1, MSR_V4_C1_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
-    {"CBOX2C0", PMC20, CBOX2, MSR_V4_C2_PERF_FIXED_CTRL0, MSR_V4_C2_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
-    {"CBOX2C1", PMC21, CBOX2, MSR_V4_C2_PERF_FIXED_CTRL1, MSR_V4_C2_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
-    {"CBOX3C0", PMC22, CBOX3, MSR_V4_C3_PERF_FIXED_CTRL0, MSR_V4_C3_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
-    {"CBOX3C1", PMC23, CBOX3, MSR_V4_C3_PERF_FIXED_CTRL1, MSR_V4_C3_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"UBOX0", PMC14, UBOX, MSR_V4_ARB_PERF_FIXED_CTRL0, MSR_V4_ARB_PERF_FIXED_CTR0, 0, 0, SKL_VALID_OPTIONS_UBOX},
+    {"UBOX1", PMC15, UBOX, MSR_V4_ARB_PERF_FIXED_CTRL1, MSR_V4_ARB_PERF_FIXED_CTR1, 0, 0, SKL_VALID_OPTIONS_UBOX},
+    {"CBOX0C0", PMC16, CBOX0, MSR_V4_C0_PERF_FIXED_CTRL0, MSR_V4_C0_PERF_FIXED_CTR0, 0, 0, SKL_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC17, CBOX0, MSR_V4_C0_PERF_FIXED_CTRL1, MSR_V4_C0_PERF_FIXED_CTR1, 0, 0, SKL_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC18, CBOX1, MSR_V4_C1_PERF_FIXED_CTRL0, MSR_V4_C1_PERF_FIXED_CTR0, 0, 0, SKL_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC19, CBOX1, MSR_V4_C1_PERF_FIXED_CTRL1, MSR_V4_C1_PERF_FIXED_CTR1, 0, 0, SKL_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC20, CBOX2, MSR_V4_C2_PERF_FIXED_CTRL0, MSR_V4_C2_PERF_FIXED_CTR0, 0, 0, SKL_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC21, CBOX2, MSR_V4_C2_PERF_FIXED_CTRL1, MSR_V4_C2_PERF_FIXED_CTR1, 0, 0, SKL_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC22, CBOX3, MSR_V4_C3_PERF_FIXED_CTRL0, MSR_V4_C3_PERF_FIXED_CTR0, 0, 0, SKL_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC23, CBOX3, MSR_V4_C3_PERF_FIXED_CTRL1, MSR_V4_C3_PERF_FIXED_CTR1, 0, 0, SKL_VALID_OPTIONS_CBOX},
 };
 
 
diff --git a/src/includes/perfmon_skylake_events.txt b/src/includes/perfmon_skylake_events.txt
index 9ce3b9a..35aede4 100644
--- a/src/includes/perfmon_skylake_events.txt
+++ b/src/includes/perfmon_skylake_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Skylake
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_types.h b/src/includes/perfmon_types.h
index c93874e..e4e427b 100644
--- a/src/includes/perfmon_types.h
+++ b/src/includes/perfmon_types.h
@@ -8,7 +8,7 @@
  *                    on x86 based architectures. Supports multi threading.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_westmere.h b/src/includes/perfmon_westmere.h
index 056a2a7..e6c2390 100644
--- a/src/includes/perfmon_westmere.h
+++ b/src/includes/perfmon_westmere.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module for Intel Westmere.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_westmereEX.h b/src/includes/perfmon_westmereEX.h
index a0c52ac..bdf577d 100644
--- a/src/includes/perfmon_westmereEX.h
+++ b/src/includes/perfmon_westmereEX.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of perfmon module for Intel Westmere EX.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -1315,16 +1315,19 @@ int perfmon_finalizeCountersThread_westmereEX(int thread_id, PerfmonEventSet* ev
                 ovf_values_core |= (1ULL<<(index+32));
                 break;
             default:
-                if (((haveLock) && (type > UNCORE)))
-                {
-                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
-                }
                 break;
         }
         if ((reg) && (((dev == MSR_DEV) && (type < UNCORE)) || (((haveLock) && (type > UNCORE)))))
         {
             VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
             CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            if (counter_map[index].counterRegister2 != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL));
+            }
         }
         eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
@@ -1341,6 +1344,24 @@ int perfmon_finalizeCountersThread_westmereEX(int thread_id, PerfmonEventSet* ev
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
         VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL, CLEAR_UNCORE_OVERFLOW);
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL));
+        for (int i=UNCORE;i<NUM_UNITS;i++)
+        {
+            if ((eventSet->regTypeMask & (REG_TYPE_MASK(i))) && box_map[i].ctrlRegister != 0x0)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL, CLEAR_UNCORE_BOX_CTRL);
+                HPMwrite(cpu_id, box_map[i].device, box_map[i].ctrlRegister, 0x0ULL);
+                if (box_map[i].filterRegister1)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister1, 0x0ULL);
+                }
+                if (box_map[i].filterRegister2)
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL, CLEAR_FILTER);
+                    HPMwrite(cpu_id, box_map[i].device, box_map[i].filterRegister2, 0x0ULL);
+                }
+            }
+        }
     }
     return 0;
 }
diff --git a/src/includes/perfmon_westmereEX_counters.h b/src/includes/perfmon_westmereEX_counters.h
index 85e4c6d..af9ab15 100644
--- a/src/includes/perfmon_westmereEX_counters.h
+++ b/src/includes/perfmon_westmereEX_counters.h
@@ -6,7 +6,7 @@
  *      Description: Counter Header File of perfmon module for Westmere EX.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_westmereEX_events.txt b/src/includes/perfmon_westmereEX_events.txt
index 014dfa6..4b68ecb 100644
--- a/src/includes/perfmon_westmereEX_events.txt
+++ b/src/includes/perfmon_westmereEX_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel WestmereEX
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/perfmon_westmere_events.txt b/src/includes/perfmon_westmere_events.txt
index 7032ae3..add7948 100644
--- a/src/includes/perfmon_westmere_events.txt
+++ b/src/includes/perfmon_westmere_events.txt
@@ -5,7 +5,7 @@
 #      Description:  Event list for Intel Westmere
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/power.h b/src/includes/power.h
index b6c26d8..abe6fe7 100644
--- a/src/includes/power.h
+++ b/src/includes/power.h
@@ -7,7 +7,7 @@
  *                    Implements Intel RAPL Interface.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/power_types.h b/src/includes/power_types.h
index 337e091..09dff96 100644
--- a/src/includes/power_types.h
+++ b/src/includes/power_types.h
@@ -6,7 +6,7 @@
  *      Description:  Types file for power module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/registers.h b/src/includes/registers.h
index 32d975e..bc81c79 100644
--- a/src/includes/registers.h
+++ b/src/includes/registers.h
@@ -6,7 +6,7 @@
  *      Description:  Register Defines for the perfmon module
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/registers_types.h b/src/includes/registers_types.h
index e588e3e..0859ff8 100644
--- a/src/includes/registers_types.h
+++ b/src/includes/registers_types.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of registers.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -181,7 +181,11 @@ static char* RegisterTypeNames[MAX_UNITS] = {
     [NOTYPE] = "No Type, used for skipping unavailable counters"
 };
 
-#define REG_TYPE_MASK(type) (type < NUM_UNITS ? (0x1ULL<<type) : 0x0ULL)
+#ifdef __x86_64 /* x86_64 builds: the single-bit mask is produced as a __uint128_t value */
+#define REG_TYPE_MASK(type) (type < NUM_UNITS ? (((__uint128_t)1ULL)<<type) : (((__uint128_t)0ULL)<<64|0ULL))
+#else /* all other targets: 64-bit mask; yields 0 for types >= NUM_UNITS */
+#define REG_TYPE_MASK(type) (type < NUM_UNITS ? (1ULL<<type) : (0x0ULL))
+#endif
 
 typedef struct {
     char*               key;
diff --git a/src/includes/textcolor.h b/src/includes/textcolor.h
index d0a3e10..bd9b919 100644
--- a/src/includes/textcolor.h
+++ b/src/includes/textcolor.h
@@ -8,7 +8,7 @@
  *                    colored text.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/thermal.h b/src/includes/thermal.h
index ac37261..1b5e400 100644
--- a/src/includes/thermal.h
+++ b/src/includes/thermal.h
@@ -7,7 +7,7 @@
  *                    Implements Intel TM/TM2 Interface.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/thermal_types.h b/src/includes/thermal_types.h
index feb17fa..0fb0791 100644
--- a/src/includes/thermal_types.h
+++ b/src/includes/thermal_types.h
@@ -6,7 +6,7 @@
  *      Description:  Types file for thermal module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/timer.h b/src/includes/timer.h
index a7ea870..8673630 100644
--- a/src/includes/timer.h
+++ b/src/includes/timer.h
@@ -11,7 +11,7 @@
  *      measurements should be over 1000 cycles.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/timer_types.h b/src/includes/timer_types.h
index 2dac362..5e972cc 100644
--- a/src/includes/timer_types.h
+++ b/src/includes/timer_types.h
@@ -6,7 +6,7 @@
  *      Description:  Types file for timer module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/tlb-info.h b/src/includes/tlb-info.h
index 1f322c9..ca6d65e 100644
--- a/src/includes/tlb-info.h
+++ b/src/includes/tlb-info.h
@@ -7,7 +7,7 @@
  *                    describing strings. Not used currently.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/topology.h b/src/includes/topology.h
index 77129fb..05ff5ed 100644
--- a/src/includes/topology.h
+++ b/src/includes/topology.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of topology module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/topology_cpuid.h b/src/includes/topology_cpuid.h
index 9e39641..5f5a8bd 100644
--- a/src/includes/topology_cpuid.h
+++ b/src/includes/topology_cpuid.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of topology backend using cpuid instruction.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/topology_hwloc.h b/src/includes/topology_hwloc.h
index 4595a08..bd990d5 100644
--- a/src/includes/topology_hwloc.h
+++ b/src/includes/topology_hwloc.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of topology backend using the hwloc library
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/topology_proc.h b/src/includes/topology_proc.h
index 1efd81b..2b0a13b 100644
--- a/src/includes/topology_proc.h
+++ b/src/includes/topology_proc.h
@@ -6,7 +6,7 @@
  *      Description:  Header File of topology backend using procfs/sysfs
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/includes/topology_types.h b/src/includes/topology_types.h
index 82cf954..a9b0d96 100644
--- a/src/includes/topology_types.h
+++ b/src/includes/topology_types.h
@@ -7,7 +7,7 @@
  *                    in likwid.h
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com,
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/includes/tree.h b/src/includes/tree.h
index 66cfa97..f4b5529 100644
--- a/src/includes/tree.h
+++ b/src/includes/tree.h
@@ -7,7 +7,7 @@
  *                    Implements a simple tree data structure.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/tree_types.h b/src/includes/tree_types.h
index d2eb7d5..dac1a4b 100644
--- a/src/includes/tree_types.h
+++ b/src/includes/tree_types.h
@@ -6,7 +6,7 @@
  *      Description:  Types file for tree module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/includes/types.h b/src/includes/types.h
index c32d870..45df468 100644
--- a/src/includes/types.h
+++ b/src/includes/types.h
@@ -6,7 +6,7 @@
  *      Description:  Global  Types file
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/libperfctr.c b/src/libperfctr.c
index 6f0ff0f..bf4ae39 100644
--- a/src/libperfctr.c
+++ b/src/libperfctr.c
@@ -6,7 +6,7 @@
  *      Description:  Marker API interface of module perfmon
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/likwid.f90 b/src/likwid.f90
index f7096e5..0735f13 100644
--- a/src/likwid.f90
+++ b/src/likwid.f90
@@ -5,7 +5,7 @@
 !     Description: Marker API f90 module
 !
 !      Version:   4.1
-!      Released:  19.5.2016
+!      Released:  8.8.2016
 !
 !     Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
 !               Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/likwid_f90_interface.c b/src/likwid_f90_interface.c
index 51285ec..1fe555b 100644
--- a/src/likwid_f90_interface.c
+++ b/src/likwid_f90_interface.c
@@ -6,7 +6,7 @@
  *      Description: F90 interface for marker API
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com,
  *               Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/luawid.c b/src/luawid.c
index 6e5ced8..791c511 100644
--- a/src/luawid.c
+++ b/src/luawid.c
@@ -6,7 +6,7 @@
  *      Description:  C part of the Likwid Lua interface
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
@@ -1753,10 +1753,9 @@ void parse(char *line, char **argv)
      *argv = '\0';                 /* mark the end of argument list  */
 }
 
-static volatile int program_running = 0;
 
 static void catch_sigchild(int signo) {
-    program_running = 0;
+    ;;
 }
 
 static int lua_likwid_startProgram(lua_State* L)
@@ -1790,7 +1789,6 @@ static int lua_likwid_startProgram(lua_State* L)
     }
     parse(exec, argv);
     ppid = getpid();
-    program_running = 1;
     pid = fork();
     if (pid < 0)
     {
@@ -1819,6 +1817,7 @@ static int lua_likwid_startProgram(lua_State* L)
 }
 static int lua_likwid_checkProgram(lua_State* L)
 {
+    int ret = -1;
     if (lua_gettop(L) == 1)
     {
         int status;
@@ -1826,9 +1825,12 @@ static int lua_likwid_checkProgram(lua_State* L)
         pid_t pid = lua_tonumber(L, 1);
         retpid = waitpid(pid, &status, WNOHANG);
         if (retpid == pid)
-            program_running = 0;
+        {
+            if (WIFEXITED(status))
+                ret = WEXITSTATUS(status);
+        }
     }
-    lua_pushboolean(L, program_running);
+    lua_pushinteger(L, ret);
     return 1;
 }
 
@@ -1836,16 +1838,22 @@ static int lua_likwid_killProgram(lua_State* L)
 {
     pid_t pid = lua_tonumber(L, 1);
     kill(pid, SIGTERM);
-    program_running = 0;
     return 0;
 }
 
-static int lua_likwid_waitwid(lua_State* L)
+static int lua_likwid_waitpid(lua_State* L)
 {
     int status;
+    int ret = -1;
     pid_t pid = lua_tonumber(L, 1);
-    waitpid(pid, &status, 0);
-    return 0;
+    pid_t retpid = waitpid(pid, &status, 0);
+    if (pid == retpid)
+    {
+        if (WIFEXITED(status))
+            ret = WEXITSTATUS(status);
+    }
+    lua_pushinteger(L, ret);
+    return 1;
 }
 
 static int lua_likwid_memSweep(lua_State* L)
@@ -2296,7 +2304,7 @@ int __attribute__ ((visibility ("default") )) luaopen_liblikwid(lua_State* L){
     lua_register(L, "likwid_killProgram", lua_likwid_killProgram);
     lua_register(L, "likwid_catchSignal", lua_likwid_catch_signal);
     lua_register(L, "likwid_getSignalState", lua_likwid_return_signal_state);
-    lua_register(L, "likwid_waitwid", lua_likwid_waitwid);
+    lua_register(L, "likwid_waitpid", lua_likwid_waitpid);
     // Verbosity functions
     lua_register(L, "likwid_setVerbosity", lua_likwid_setVerbosity);
     // Marker API functions
diff --git a/src/memsweep.c b/src/memsweep.c
index 012c000..f9ea77e 100644
--- a/src/memsweep.c
+++ b/src/memsweep.c
@@ -6,7 +6,7 @@
  *      Description:  Implementation of sweeper module.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
@@ -104,7 +104,8 @@ static void cleanupCache(char* ptr)
 {
 #if defined(__x86_64__) || defined(__i386__)
     uint32_t cachesize = 2 * cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].size;
-    printf("Cleaning LLC with %g MB\n", (double)cachesize/(1024.0 * 1024.0));
+    if (getenv("LIKWID_SILENT") == NULL)
+        printf("Cleaning LLC with %g MB\n", (double)cachesize/(1024.0 * 1024.0));
     _loadData(cachesize,ptr);
 #else
     ERROR_PLAIN_PRINT(Cleanup cache is currently only available on X86 systems.);
@@ -136,10 +137,13 @@ memsweep_domain(int domainId)
 {
     char* ptr = NULL;
     size_t size = numa_info.nodes[domainId].totalMemory * 1024ULL * memoryFraction / 100ULL;
-    printf("Sweeping domain %d: Using %g MB of %g MB\n",
-            domainId,
-            size / (1024.0 * 1024.0),
-            numa_info.nodes[domainId].totalMemory/ 1024.0);
+    if (getenv("LIKWID_SILENT") == NULL)
+    {
+        printf("Sweeping domain %d: Using %g MB of %g MB\n",
+                domainId,
+                size / (1024.0 * 1024.0),
+                numa_info.nodes[domainId].totalMemory/ 1024.0);
+    }
     ptr = (char*) allocateOnNode(size, domainId);
     initMemory(size, ptr, domainId);
     cleanupCache(ptr);
diff --git a/src/numa.c b/src/numa.c
index 09459d3..c882508 100644
--- a/src/numa.c
+++ b/src/numa.c
@@ -7,7 +7,7 @@
  *                    procfs/sysfs backends.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/numa_hwloc.c b/src/numa_hwloc.c
index 94639fc..ccd69ad 100644
--- a/src/numa_hwloc.c
+++ b/src/numa_hwloc.c
@@ -6,7 +6,7 @@
  *      Description:  Interface to hwloc for NUMA topology
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/numa_proc.c b/src/numa_proc.c
index a17d824..a7623e7 100644
--- a/src/numa_proc.c
+++ b/src/numa_proc.c
@@ -6,7 +6,7 @@
  *      Description:  Get NUMA topology from procfs and sysfs
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/pci_hwloc.c b/src/pci_hwloc.c
index 217e447..6177e21 100644
--- a/src/pci_hwloc.c
+++ b/src/pci_hwloc.c
@@ -6,7 +6,7 @@
  *      Description:  Interface to hwloc for PCI device lookup
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/pci_proc.c b/src/pci_proc.c
index cee436f..854a526 100644
--- a/src/pci_proc.c
+++ b/src/pci_proc.c
@@ -6,7 +6,7 @@
  *      Description:  Interface to procfs/sysfs for PCI device lookup
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/perfgroup.c b/src/perfgroup.c
index 166790e..aff2e7c 100644
--- a/src/perfgroup.c
+++ b/src/perfgroup.c
@@ -6,7 +6,7 @@
  *      Description:  Handler for performance groups and event sets
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at gmail.com
diff --git a/src/perfmon.c b/src/perfmon.c
index ee4f80f..09b957b 100644
--- a/src/perfmon.c
+++ b/src/perfmon.c
@@ -6,7 +6,7 @@
  *      Description:  Main implementation of the performance monitoring module
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -65,6 +65,7 @@
 #include <perfmon_interlagos.h>
 #include <perfmon_kabini.h>
 #include <perfmon_silvermont.h>
+#include <perfmon_goldmont.h>
 #include <perfmon_broadwell.h>
 #include <perfmon_skylake.h>
 
@@ -653,6 +654,15 @@ perfmon_init_maps(void)
                     perfmon_numCoreCounters = perfmon_numCoreCountersSilvermont;
                     break;
 
+                case ATOM_SILVERMONT_GOLD:
+                    eventHash = goldmont_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsGoldmont;
+                    counter_map = goldmont_counter_map;
+                    box_map = goldmont_box_map;
+                    perfmon_numCounters = perfmon_numCountersGoldmont;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersGoldmont;
+                    break;
+
                 case CORE_DUO:
                     ERROR_PLAIN_PRINT(Unsupported Processor);
                     break;
@@ -905,6 +915,16 @@ perfmon_init_funcs(int* init_power, int* init_temp)
                     perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_silvermont;
                     break;
 
+                case ATOM_SILVERMONT_GOLD:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_goldmont;
+                    perfmon_startCountersThread = perfmon_startCountersThread_goldmont;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_goldmont;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_goldmont;
+                    perfmon_readCountersThread = perfmon_readCountersThread_goldmont;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_goldmont;
+                    break;
 
                 case CORE_DUO:
                     ERROR_PLAIN_PRINT(Unsupported Processor);
@@ -1365,7 +1385,11 @@ perfmon_addEventSet(char* eventCString)
         return -ENOMEM;
     }
     eventSet->numberOfEvents = 0;
+#ifdef __x86_64
+    eventSet->regTypeMask = ((__uint128_t)0x0ULL<<64)|0x0ULL;
+#else
     eventSet->regTypeMask = 0x0ULL;
+#endif
 
 
     int forceOverwrite = 0;
@@ -1523,7 +1547,7 @@ perfmon_setupCounters(int groupId)
 int
 __perfmon_startCounters(int groupId)
 {
-    int i = 0;
+    int i = 0, j = 0;
     int ret = 0;
     if (groupSet->groups[groupId].state != STATE_SETUP)
     {
@@ -1531,6 +1555,8 @@ __perfmon_startCounters(int groupId)
     }
     for(;i<groupSet->numberOfThreads;i++)
     {
+        for (j=0; j<perfmon_getNumberOfEvents(groupId); j++)
+            groupSet->groups[groupId].events[j].threadCounter[i].overflows = 0;
         ret = perfmon_startCountersThread(groupSet->threads[i].thread_id, &groupSet->groups[groupId]);
         if (ret)
         {
@@ -1608,7 +1634,7 @@ __perfmon_stopCounters(int groupId)
     {
         for (j=0; j<perfmon_getNumberOfThreads(); j++)
         {
-            result = calculateResult(groupId, i, j);
+            result = (double)calculateResult(groupId, i, j);
             groupSet->groups[groupId].events[i].threadCounter[j].lastResult = result;
             groupSet->groups[groupId].events[i].threadCounter[j].fullResult += result;
         }
@@ -1703,9 +1729,15 @@ __perfmon_readCounters(int groupId, int threadId)
             }
             for (j=0; j < groupSet->groups[groupId].numberOfEvents; j++)
             {
-                groupSet->groups[groupId].events[j].threadCounter[threadId].lastResult = (double)calculateResult(groupId, j, threadId);
-                groupSet->groups[groupId].events[j].threadCounter[threadId].lastResult = result;
-                groupSet->groups[groupId].events[j].threadCounter[threadId].fullResult += result;
+                if (groupSet->groups[groupId].events[j].type != NOTYPE)
+                {
+                    result = (double)calculateResult(groupId, j, threadId);
+                    groupSet->groups[groupId].events[j].threadCounter[threadId].lastResult = result;
+                    groupSet->groups[groupId].events[j].threadCounter[threadId].fullResult += result;
+                    groupSet->groups[groupId].events[j].threadCounter[threadId].startData = 
+                        groupSet->groups[groupId].events[j].threadCounter[threadId].counterData;
+                    groupSet->groups[groupId].events[j].threadCounter[threadId].overflows = 0;
+                }
             }
         }
     }
@@ -1718,9 +1750,12 @@ __perfmon_readCounters(int groupId, int threadId)
         }
         for (j=0; j < groupSet->groups[groupId].numberOfEvents; j++)
         {
-            groupSet->groups[groupId].events[j].threadCounter[threadId].lastResult = (double)calculateResult(groupId, j, threadId);
+            result = (double)calculateResult(groupId, j, threadId);
             groupSet->groups[groupId].events[j].threadCounter[threadId].lastResult = result;
             groupSet->groups[groupId].events[j].threadCounter[threadId].fullResult += result;
+            groupSet->groups[groupId].events[j].threadCounter[threadId].startData =
+                groupSet->groups[groupId].events[j].threadCounter[threadId].counterData;
+            groupSet->groups[groupId].events[j].threadCounter[threadId].overflows = 0;
         }
 }
     timer_start(&groupSet->groups[groupId].timer);
@@ -1798,8 +1833,17 @@ perfmon_getResult(int groupId, int eventId, int threadId)
         printf("ERROR: ThreadID greater than defined threads\n");
         return 0;
     }
+    if (groupSet->groups[groupId].events[eventId].type == NOTYPE)
+        return 0;
 
-    if (groupSet->groups[groupId].events[eventId].threadCounter[threadId].fullResult == 0)
+    if ((groupSet->groups[groupId].events[eventId].threadCounter[threadId].fullResult == 0) ||
+        (groupSet->groups[groupId].events[eventId].type == THERMAL) ||
+        (groupSet->groups[groupId].events[eventId].type == QBOX0FIX) ||
+        (groupSet->groups[groupId].events[eventId].type == QBOX1FIX) ||
+        (groupSet->groups[groupId].events[eventId].type == QBOX2FIX) ||
+        (groupSet->groups[groupId].events[eventId].type == SBOX0FIX) ||
+        (groupSet->groups[groupId].events[eventId].type == SBOX1FIX) ||
+        (groupSet->groups[groupId].events[eventId].type == SBOX2FIX))
     {
         return groupSet->groups[groupId].events[eventId].threadCounter[threadId].lastResult;
     }
@@ -1836,6 +1880,8 @@ perfmon_getLastResult(int groupId, int eventId, int threadId)
         printf("ERROR: ThreadID greater than defined threads\n");
         return 0;
     }
+    if (groupSet->groups[groupId].events[eventId].type == NOTYPE)
+        return 0;
 
     return groupSet->groups[groupId].events[eventId].threadCounter[threadId].lastResult;
 }
@@ -1879,13 +1925,13 @@ perfmon_getMetric(int groupId, int metricId, int threadId)
         add_to_clist(&clist,groupSet->groups[groupId].group.counters[e],
                      perfmon_getResult(groupId, e, threadId));
     }
-    add_to_clist(&clist, "time", perfmon_getLastTimeOfGroup(groupId));
+    add_to_clist(&clist, "time", perfmon_getTimeOfGroup(groupId));
     add_to_clist(&clist, "inverseClock", 1.0/timer_getCycleClock());
     e = calc_metric(groupSet->groups[groupId].group.metricformulas[metricId], &clist, &result);
     if (e < 0)
     {
         result = 0.0;
-        ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[groupId].group.metricformulas[metricId]);
+        //ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[groupId].group.metricformulas[metricId]);
     }
     destroy_clist(&clist);
     return result;
@@ -1935,7 +1981,7 @@ perfmon_getLastMetric(int groupId, int metricId, int threadId)
     if (e < 0)
     {
         result = 0.0;
-        ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[groupId].group.metricformulas[metricId]);
+        //ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[groupId].group.metricformulas[metricId]);
     }
     destroy_clist(&clist);
     return result;
diff --git a/src/perfmon_perf.c b/src/perfmon_perf.c
index 17a56c0..cfc40ac 100644
--- a/src/perfmon_perf.c
+++ b/src/perfmon_perf.c
@@ -7,7 +7,7 @@
  *                    Currently not integrated in perfmon.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
diff --git a/src/power.c b/src/power.c
index d76c965..e33695c 100644
--- a/src/power.c
+++ b/src/power.c
@@ -6,7 +6,7 @@
  *      Description:  Module implementing Intel RAPL interface
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -80,6 +80,8 @@ power_init(int cpuId)
         case ATOM_SILVERMONT_Z1:
         case ATOM_SILVERMONT_Z2:
         case ATOM_SILVERMONT_F:
+        case ATOM_SILVERMONT_AIR:
+        case ATOM_SILVERMONT_GOLD:
         case BROADWELL:
         case BROADWELL_E:
         case BROADWELL_D:
diff --git a/src/pthread-overload/Makefile b/src/pthread-overload/Makefile
index 889d824..eaf6adf 100644
--- a/src/pthread-overload/Makefile
+++ b/src/pthread-overload/Makefile
@@ -5,7 +5,7 @@
 #      Description:  pthread-overload Makefile
 #
 #      Version:   4.1
-#      Released:  19.5.2016
+#      Released:  8.8.2016
 #
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #               Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/pthread-overload/pthread-overload.c b/src/pthread-overload/pthread-overload.c
index f076b08..05e74fc 100644
--- a/src/pthread-overload/pthread-overload.c
+++ b/src/pthread-overload/pthread-overload.c
@@ -7,7 +7,7 @@
  *                    Implements pinning of threads together with likwid-pin.
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/thermal.c b/src/thermal.c
index e5cf7a9..8a46e67 100644
--- a/src/thermal.c
+++ b/src/thermal.c
@@ -6,7 +6,7 @@
  *      Description:  Module implementing Intel TM/TM2 interface
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/timer.c b/src/timer.c
index ce43bba..706158d 100644
--- a/src/timer.c
+++ b/src/timer.c
@@ -6,7 +6,7 @@
  *      Description:  Implementation of timer module
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/src/topology.c b/src/topology.c
index 602abf2..b4ab30b 100644
--- a/src/topology.c
+++ b/src/topology.c
@@ -6,7 +6,7 @@
  *      Description:  Interface to the topology backends
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -1009,6 +1009,7 @@ void print_supportedCPUs (void)
     printf("\t%s\n",broadwell_d_str);
     printf("\t%s\n",broadwell_ep_str);
     printf("\t%s\n",skylake_str);
+    printf("\t%s\n",atom_goldmont_str);
     printf("\n");
     printf("Supported AMD processors:\n");
     printf("\t%s\n",opteron_sc_str);
diff --git a/src/topology_cpuid.c b/src/topology_cpuid.c
index 504714d..5fbcea4 100644
--- a/src/topology_cpuid.c
+++ b/src/topology_cpuid.c
@@ -6,7 +6,7 @@
  *      Description:  Interface to the cpuid based topology backend
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
diff --git a/src/topology_hwloc.c b/src/topology_hwloc.c
index 04c2417..5111faa 100644
--- a/src/topology_hwloc.c
+++ b/src/topology_hwloc.c
@@ -6,7 +6,7 @@
  *      Description:  Interface to the hwloc based topology backend
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Authors:  Thomas Roehl (tr), thomas.roehl at googlemail.com
  *
@@ -126,9 +126,13 @@ void hwloc_init_nodeTopology(cpu_set_t cpuSet)
     int maxNumLogicalProcs;
     int maxNumLogicalProcsPerCore;
     int maxNumCores;
+    int maxNumSockets;
+    int maxNumCoresPerSocket;
     hwloc_obj_t obj;
     int poolsize = 0;
+    int nr_sockets = 1;
     int id = 0;
+    int consecutive_cores = -1;
     hwloc_obj_type_t socket_type = HWLOC_OBJ_SOCKET;
     for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
     {
@@ -153,7 +157,21 @@ void hwloc_init_nodeTopology(cpu_set_t cpuSet)
     {
         socket_type = HWLOC_OBJ_NODE;
     }
-    maxNumLogicalProcsPerCore = maxNumLogicalProcs/maxNumCores;
+    maxNumSockets = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, socket_type);
+    obj = likwid_hwloc_get_obj_by_type(hwloc_topology, socket_type, 0);
+    if (obj)
+    {
+        maxNumCoresPerSocket = likwid_hwloc_record_objs_of_type_below_obj(hwloc_topology, obj, HWLOC_OBJ_CORE, NULL, NULL);
+    }
+    obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_CORE, 0);
+    if (obj)
+    {
+        maxNumLogicalProcsPerCore = likwid_hwloc_record_objs_of_type_below_obj(hwloc_topology, obj, HWLOC_OBJ_PU, NULL, NULL);
+    }
+    else
+    {
+        maxNumLogicalProcsPerCore = maxNumLogicalProcs/maxNumCores;
+    }
     for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
     {
         int skip = 0;
@@ -163,46 +181,59 @@ void hwloc_init_nodeTopology(cpu_set_t cpuSet)
             continue;
         }
         id = obj->os_index;
-        hwThreadPool[id].inCpuSet = 1;
+        if (CPU_ISSET(id, &cpuSet))
+            hwThreadPool[id].inCpuSet = 1;
         hwThreadPool[id].apicId = obj->os_index;
         hwThreadPool[id].threadId = obj->sibling_rank;
-        while (obj->type != HWLOC_OBJ_CORE) {
-            obj = obj->parent;
-            if (!obj)
+        if (maxNumLogicalProcsPerCore > 1)
+        {
+            while (obj->type != HWLOC_OBJ_CORE) {
+                obj = obj->parent;
+                if (!obj)
+                {
+                    skip = 1;
+                    break;
+                }
+            }
+            if (skip)
             {
-                skip = 1;
-                break;
+                hwThreadPool[id].coreId = 0;
+                hwThreadPool[id].packageId = 0;
+                continue;
             }
+            hwThreadPool[id].coreId = obj->os_index;
         }
-        if (skip)
+        else
         {
-            hwThreadPool[id].coreId = 0;
-            hwThreadPool[id].packageId = 0;
-            continue;
+            hwThreadPool[id].coreId = hwThreadPool[id].apicId % maxNumCoresPerSocket;
         }
-        hwThreadPool[id].coreId = obj->os_index;
-        while (obj->type != socket_type) {
-            obj = obj->parent;
-            if (!obj)
+        if (maxNumSockets > 1)
+        {
+            while (obj->type != socket_type) {
+                obj = obj->parent;
+                if (!obj)
+                {
+                    skip = 1;
+                    break;
+                }
+            }
+            if (skip)
             {
-                skip = 1;
-                break;
+                hwThreadPool[id].packageId = 0;
+                continue;
             }
+            hwThreadPool[id].packageId = obj->os_index;
         }
-        if (skip)
+        else
         {
             hwThreadPool[id].packageId = 0;
-            continue;
         }
-        hwThreadPool[id].packageId = obj->os_index;
-        /*DEBUG_PRINT(DEBUGLEV_DEVELOP, HWLOC Thread Pool PU %d Thread %d Core %d Socket %d,
-                            hwThreadPool[threadIdx].apicId,
-                            hwThreadPool[threadIdx].threadId,
-                            hwThreadPool[threadIdx].coreId,
-                            hwThreadPool[threadIdx].packageId)*/
-        DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
-                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
-                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, HWLOC Thread Pool PU %d Thread %d Core %d Socket %d inCpuSet %d,
+                            hwThreadPool[i].apicId,
+                            hwThreadPool[i].threadId,
+                            hwThreadPool[i].coreId,
+                            hwThreadPool[i].packageId,
+                            hwThreadPool[i].inCpuSet)
     }
 
     cpuid_topology.threadPool = hwThreadPool;
diff --git a/src/topology_proc.c b/src/topology_proc.c
index 1d3d0e0..5e09485 100644
--- a/src/topology_proc.c
+++ b/src/topology_proc.c
@@ -6,7 +6,7 @@
  *      Description:  Interface to the procfs/sysfs based topology backend
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
  *                Thomas Roehl (tr), thomas.roehl at googlemail.com
@@ -409,10 +409,10 @@ void proc_init_nodeTopology(cpu_set_t cpuSet)
         hwThreadPool[i].threadId = -1;
         hwThreadPool[i].coreId = -1;
         hwThreadPool[i].packageId = -1;
-        hwThreadPool[i].inCpuSet = 1;
-        if (!CPU_ISSET(i, &cpuSet))
+        hwThreadPool[i].inCpuSet = 0;
+        if (CPU_ISSET(i, &cpuSet))
         {
-            hwThreadPool[i].inCpuSet = 0;
+            hwThreadPool[i].inCpuSet = 1;
         }
         cpudir = bformat("/sys/devices/system/cpu/cpu%d/topology",i);
         file = bformat("%s/core_id", bdata(cpudir));
@@ -439,11 +439,12 @@ void proc_init_nodeTopology(cpu_set_t cpuSet)
             fclose(fp);
         }
         bdestroy(file);
-        DEBUG_PRINT(DEBUGLEV_DEVELOP, PROC Thread Pool PU %d Thread %d Core %d Socket %d,
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, PROC Thread Pool PU %d Thread %d Core %d Socket %d inCpuSet %d,
                             hwThreadPool[i].apicId,
                             hwThreadPool[i].threadId,
                             hwThreadPool[i].coreId,
-                            hwThreadPool[i].packageId)
+                            hwThreadPool[i].packageId,
+                            hwThreadPool[i].inCpuSet)
         bdestroy(cpudir);
     }
     cpuid_topology.threadPool = hwThreadPool;
diff --git a/src/tree.c b/src/tree.c
index 2ac8ab8..e60c9f9 100644
--- a/src/tree.c
+++ b/src/tree.c
@@ -6,7 +6,7 @@
  *      Description:  Module implementing a tree data structure
  *
  *      Version:   4.1
- *      Released:  19.5.2016
+ *      Released:  8.8.2016
  *
  *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
diff --git a/test/MPI_pin_test.c b/test/MPI_pin_test.c
index f0e1271..2f86387 100644
--- a/test/MPI_pin_test.c
+++ b/test/MPI_pin_test.c
@@ -68,10 +68,12 @@ main(int argc, char **argv)
             sprintf(cmd, "pstree -p -H %d %d",pid, pid);
             system(cmd);
         }
+#ifdef _OPENMP
 #pragma omp critical
         {
-            printf ("Rank %d Thread %d running on core %d/%d with pid %d and tid %d\n",rank,omp_get_thread_num(), sched_getcpu(),get_cpu_id(), getpid(),gettid());
+            printf ("Rank %d Thread %d running on Node %s core %d/%d with pid %d and tid %d\n",rank,omp_get_thread_num(), host, sched_getcpu(),get_cpu_id(), getpid(),gettid());
         }
+#endif
 
     }
 
diff --git a/test/accuracy/TESTS/FLOPS_AVX.txt b/test/accuracy/TESTS/FLOPS_AVX.txt
index 7c2ea39..0a27fcd 100644
--- a/test/accuracy/TESTS/FLOPS_AVX.txt
+++ b/test/accuracy/TESTS/FLOPS_AVX.txt
@@ -1,5 +1,5 @@
 REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+Packed DP MFLOP\/s\s+\|\s+([0-9\.e\+\-]+)
+REGEX_PERF \|\s+Packed DP M[Ff][Ll][Oo][Pp][Ss]*\/s\s+\|\s+([0-9\.e\+\-]+)
 
 
 TEST triad_avx
diff --git a/test/accuracy/TESTS/FLOPS_DP.txt b/test/accuracy/TESTS/FLOPS_DP.txt
index 810308b..a24ccef 100644
--- a/test/accuracy/TESTS/FLOPS_DP.txt
+++ b/test/accuracy/TESTS/FLOPS_DP.txt
@@ -1,5 +1,5 @@
 REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+MFLOP\/s\s+\|\s+([0-9\.e\+\-]+)
+REGEX_PERF \|\s+M[Ff][Ll][Oo][Pp][Ss]*\/s\s+\|\s+([0-9\.e\+\-]+)
 
 TEST stream
 RUNS 10
diff --git a/test/accuracy/TESTS/FLOPS_SP.txt b/test/accuracy/TESTS/FLOPS_SP.txt
index 72f2a62..1e2cf10 100644
--- a/test/accuracy/TESTS/FLOPS_SP.txt
+++ b/test/accuracy/TESTS/FLOPS_SP.txt
@@ -1,5 +1,5 @@
 REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+MFLOP\/s\s+\|\s+([0-9\.e\+\-]+)
+REGEX_PERF \|\s+M[Ff][Ll][Oo][Pp][Ss]*\/s\s+\|\s+([0-9\.e\+\-]+)
 
 TEST sum_sp
 RUNS 10
diff --git a/test/accuracy/likwid-accuracy.py b/test/accuracy/likwid-accuracy.py
index 916ed38..f6ec339 100755
--- a/test/accuracy/likwid-accuracy.py
+++ b/test/accuracy/likwid-accuracy.py
@@ -424,7 +424,7 @@ for line in sets:
         testfp = open(filename,'r')
         for line in testfp.read().split("\n"):
             if line.startswith("GROUP"):
-                match = re.match("^GROUP\s+(\.+)")
+                match = re.match("^GROUP\s+(\.+)", line)
                 if match:
                     groupname = match.group(1)
                     break
@@ -486,7 +486,7 @@ if not only_wiki:
     script.write("#!/bin/bash\n")
 
     for group in test_set.keys():
-        perfctr_string = "%s -C E:N:%d:1:2 -g %s -m " % (perfctr,nrThreads, group,)
+        perfctr_string = "%s -f -C E:N:%d:1:2 -g %s -m " % (perfctr,nrThreads, group,)
         no_scale = False
         for test in test_set[group].keys():
             if test.startswith("REGEX"): continue

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/likwid/likwid.git



More information about the Likwid-commit mailing list